Skip to content

Commit 9a09f80

Browse files
koppor and claude committed
Add plain-text citation hover for non-hyperlinked PDFs
Port of the add-text-based-reference-hovering work onto the refactored detector architecture (RefHoverDetect):

- Pure pattern matching is unit-testable in RefHoverDetect.cpp: DetectCitationInPageText finds "(Surname et al., 2020)" / "Surname (2020)" at the cursor; FindSurnameInPageText locates the matching bibliography entry on a page.
- RefHover.cpp adds the engine-driven page walk, the per-document lookup cache (positive and negative results), and the multi-word / fragmented-surname fallback.
- Canvas.cpp falls back to plain-text detection when no link element is under the cursor.
- DetectEntryBox gains the Word-style PDF robustness fixes: major-glyph newline tracking, measured-line-height gating for the paragraph-gap rule, and a 6-line cap for author-year entries without hanging indent.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
1 parent 50ec145 commit 9a09f80

6 files changed

Lines changed: 790 additions & 8 deletions

File tree

src/Canvas.cpp

Lines changed: 34 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -895,7 +895,8 @@ static void OnMouseMove(MainWindow* win, int x, int y, WPARAM) {
895895
case MouseAction::None: {
896896
Annotation* annot = dm->GetAnnotationAtPos(pos, nullptr);
897897
Annotation* prev = win->annotationUnderCursor;
898-
IPageElement* el = dm->GetElementAtPos(pos, nullptr);
898+
int srcPageNo = -1;
899+
IPageElement* el = dm->GetElementAtPos(pos, &srcPageNo);
899900
// the annotation notification below is suppressed in favor of
900901
// the citation hover popup, but only when that feature is on
901902
bool citationHoverEnabled = gGlobalPrefs->citationHoverDelay >= 0;
@@ -935,6 +936,7 @@ static void OnMouseMove(MainWindow* win, int x, int y, WPARAM) {
935936
if (!win->refHover) {
936937
win->refHover = RefHoverCreate(win->hwndCanvas);
937938
}
939+
bool scheduled = false;
938940
if (win->refHover && hasInternalLink) {
939941
// request WM_MOUSELEAVE so popup hides when cursor leaves canvas
940942
TrackMouseLeave(win->hwndCanvas);
@@ -958,7 +960,37 @@ static void OnMouseMove(MainWindow* win, int x, int y, WPARAM) {
958960
int delayMs = gGlobalPrefs->citationHoverDelay;
959961
RefHoverSchedule(win->refHover, win->hwndCanvas, delayMs, screenPt, destPage, destPt.x, destPt.y,
960962
destZoom, srcPage, srcRect, pageScreenRect);
961-
} else if (win->refHover) {
963+
scheduled = true;
964+
} else if (win->refHover && srcPageNo > 0) {
965+
// No link element under cursor — try plain-text citation
966+
// detection ("(Smith et al., 2020)" style references in
967+
// PDFs without hyperref). Convert cursor from screen
968+
// coords to page coords before searching the text cache.
969+
PointF pagePtF = dm->CvtFromScreen(pos, srcPageNo);
970+
Point pagePt{(int)pagePtF.x, (int)pagePtF.y};
971+
int destPage = -1;
972+
float destX = -1.f, destY = -1.f;
973+
if (RefHoverTryPlainText(win->refHover, dm->GetEngine(), srcPageNo, pagePt, destPage, destX,
974+
destY)) {
975+
TrackMouseLeave(win->hwndCanvas);
976+
Point screenPt = {x, y};
977+
ClientToScreen(win->hwndCanvas, (POINT*)&screenPt);
978+
Rect pageScreenRect{};
979+
PageInfo* pi = dm->GetPageInfo(srcPageNo);
980+
if (pi && !pi->pageOnScreen.IsEmpty()) {
981+
pageScreenRect = pi->pageOnScreen;
982+
POINT topLeft = {pageScreenRect.x, pageScreenRect.y};
983+
ClientToScreen(win->hwndCanvas, &topLeft);
984+
pageScreenRect.x = topLeft.x;
985+
pageScreenRect.y = topLeft.y;
986+
}
987+
int delayMs = gGlobalPrefs->citationHoverDelay;
988+
RefHoverSchedule(win->refHover, win->hwndCanvas, delayMs, screenPt, destPage, destX, destY,
989+
0.f, srcPageNo, RectF{}, pageScreenRect);
990+
scheduled = true;
991+
}
992+
}
993+
if (!scheduled && win->refHover) {
962994
RefHoverHide(win->refHover, win->hwndCanvas);
963995
}
964996
} else if (win->refHover) {

src/RefHover.cpp

Lines changed: 223 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,61 @@
7171

7272
#define REF_HOVER_CLASS L"SumatraPDFRefHover"
7373

74+
// === Plain-text citation lookup cache ===
// One resolved (or failed) plain-text citation lookup. Entries are keyed by
// (surname, year, srcPage). The source page is part of the key because the
// bibliography scan only covers the pages from srcPage to the end of the
// document, so the same citation may resolve differently when hovered on
// another page. Negative results (destPage == -1) are stored as well, so a
// citation without a matching bibliography entry does not trigger a fresh
// document scan on every hover.
//
// Every member has a default so that a default-constructed entry is never
// left with indeterminate values; the struct stays trivially copyable.
struct CitationCacheEntry {
    char* surname = nullptr; // owned UTF-8 copy of the surname
    int year = 0;
    int srcPage = -1;  // page the lookup was issued from
    int destPage = -1; // page holding the bibliography entry, -1 if not found
    float destX = 0.f; // location reported by the page search
    float destY = 0.f;
};
86+
87+
struct RefLookupCache {
88+
Vec<CitationCacheEntry> entries;
89+
};
90+
91+
// Returns the cached entry for (surname, year, srcPage), or nullptr when the
// cache pointer is null or no entry matches. A returned entry may be a
// negative result (destPage == -1), so callers have to inspect destPage.
static const CitationCacheEntry* CacheLookup(RefLookupCache* c, const char* surname, int year, int srcPage) {
    const size_t count = c ? c->entries.size() : 0;
    for (size_t idx = 0; idx < count; idx++) {
        const CitationCacheEntry& candidate = c->entries[idx];
        // cheap integer keys first, string comparison last
        if (candidate.year != year || candidate.srcPage != srcPage) {
            continue;
        }
        if (str::Eq(candidate.surname, surname)) {
            return &candidate;
        }
    }
    return nullptr;
}
103+
104+
// Appends a lookup result to the cache. The surname is duplicated, so the
// caller keeps ownership of its own string. Pass destPage == -1 to record a
// negative result. Does nothing when the cache pointer is null.
static void CacheInsert(RefLookupCache* c, const char* surname, int year, int srcPage, int destPage, float destX,
                        float destY) {
    if (c == nullptr) {
        return;
    }
    CitationCacheEntry fresh;
    // key
    fresh.surname = str::Dup(surname);
    fresh.year = year;
    fresh.srcPage = srcPage;
    // payload
    fresh.destPage = destPage;
    fresh.destX = destX;
    fresh.destY = destY;
    c->entries.Append(fresh);
}
118+
119+
// Destroys the cache: releases every owned surname string, then the cache
// object itself. Accepts a null pointer.
static void CacheFree(RefLookupCache* c) {
    if (c == nullptr) {
        return;
    }
    size_t idx = 0;
    while (idx < c->entries.size()) {
        CitationCacheEntry& entry = c->entries[idx];
        str::Free(entry.surname);
        idx++;
    }
    delete c;
}
128+
74129
// upper bound for the auto-fit base zoom. We render at min(kRenderZoom,
75130
// fit-to-popup-max), then multiply by RefHoverState::Displayed::userZoom
76131
// (the mouse-wheel adjustment).
@@ -212,6 +267,8 @@ void RefHoverDestroy(RefHoverState* s) {
212267
}
213268
delete s->bmp;
214269
s->bmp = nullptr;
270+
CacheFree(s->lookupCache);
271+
s->lookupCache = nullptr;
215272
delete s;
216273
}
217274

@@ -974,3 +1031,169 @@ void RefHoverOnTimer(RefHoverState* s, HWND hwndCanvas, EngineBase* engine, floa
9741031
req.destYRaw = s->pending.destY;
9751032
RefHoverRequestRender(s, engine, req);
9761033
}
1034+
1035+
// === Plain-text citation lookup ===
1036+
1037+
// What the citation detector found under the cursor: the first author's
// surname and the publication year.
struct DetectedCitation {
    char* surname; // UTF-8, owned by this struct's holder; nullptr when nothing was detected
    int year;
};
1042+
1043+
// Releases the surname held by *c and resets the pointer so no dangling
// reference is left behind.
static void FreeDetectedCitation(DetectedCitation* c) {
    char* owned = c->surname;
    c->surname = nullptr;
    str::Free(owned);
}
1047+
1048+
// Detect a citation pattern under the cursor on srcPage. On success, returns
1049+
// true and fills *out with a freshly-allocated surname and year. The actual
1050+
// pattern matching is the pure DetectCitationInPageText (RefHoverDetect.cpp).
1051+
static bool DetectCitationAtCursor(EngineBase* engine, int srcPage, Point pagePos, DetectedCitation* out) {
1052+
out->surname = nullptr;
1053+
out->year = 0;
1054+
int textLen = 0;
1055+
Rect* coords = nullptr;
1056+
const WCHAR* text = engine->GetTextForPage(srcPage, &textLen, &coords);
1057+
return DetectCitationInPageText(text, coords, textLen, pagePos, &out->surname, &out->year);
1058+
}
1059+
1060+
// Walk pages from pageCount → srcPage looking for a bibliography entry that
1061+
// matches the surname + year. Returns true on hit.
1062+
static bool FindReferenceLocation(EngineBase* engine, int srcPage, const char* surname, int year, int* destPageOut,
1063+
float* destXOut, float* destYOut) {
1064+
if (!engine || !surname || !*surname) {
1065+
return false;
1066+
}
1067+
int pageCount = engine->PageCount();
1068+
if (pageCount <= 0 || srcPage < 1 || srcPage > pageCount) {
1069+
return false;
1070+
}
1071+
1072+
// Convert surname to wide string for engine text matching.
1073+
WCHAR* surnameW = ToWStr(surname);
1074+
if (!surnameW) {
1075+
return false;
1076+
}
1077+
int surnameLen = (int)str::Len(surnameW);
1078+
if (surnameLen < 2) {
1079+
free(surnameW);
1080+
return false;
1081+
}
1082+
1083+
bool found = false;
1084+
for (int p = pageCount; p >= srcPage; p--) {
1085+
int textLen = 0;
1086+
Rect* coords = nullptr;
1087+
const WCHAR* text = engine->GetTextForPage(p, &textLen, &coords);
1088+
float x = 0, y = 0;
1089+
if (FindSurnameInPageText(text, coords, textLen, surnameW, surnameLen, year, &x, &y)) {
1090+
*destPageOut = p;
1091+
*destXOut = x;
1092+
*destYOut = y;
1093+
found = true;
1094+
break;
1095+
}
1096+
}
1097+
free(surnameW);
1098+
return found;
1099+
}
1100+
1101+
// Look up `surname` in the cache; on miss, do a fresh document scan and
1102+
// insert the result (positive or negative). Returns true on positive hit.
1103+
static bool LookupOrSearch(RefHoverState* s, EngineBase* engine, int srcPage, const char* surname, int year,
1104+
int& destPageOut, float& destXOut, float& destYOut) {
1105+
const CitationCacheEntry* hit = CacheLookup(s->lookupCache, surname, year, srcPage);
1106+
if (hit) {
1107+
if (hit->destPage > 0) {
1108+
destPageOut = hit->destPage;
1109+
destXOut = hit->destX;
1110+
destYOut = hit->destY;
1111+
return true;
1112+
}
1113+
return false;
1114+
}
1115+
int destPage = -1;
1116+
float destX = -1.f, destY = -1.f;
1117+
if (FindReferenceLocation(engine, srcPage, surname, year, &destPage, &destX, &destY)) {
1118+
CacheInsert(s->lookupCache, surname, year, srcPage, destPage, destX, destY);
1119+
destPageOut = destPage;
1120+
destXOut = destX;
1121+
destYOut = destY;
1122+
return true;
1123+
}
1124+
CacheInsert(s->lookupCache, surname, year, srcPage, -1, 0.f, 0.f);
1125+
return false;
1126+
}
1127+
1128+
// Upper bound on the number of surname fragments the fallback will try.
static const int kMaxSurnameParts = 8;

// A space-delimited fragment of a detected surname (not NUL-terminated).
struct SurnamePart {
    const char* s; // first byte of the fragment inside the surname
    int len;       // length in bytes
};

// Splits `surname` on spaces into at most maxParts fragments, dropping
// fragments shorter than 2 bytes, and orders them by length, longest first.
// Returns the number of fragments written to `parts`.
static int SplitSurnameParts(const char* surname, SurnamePart* parts, int maxParts) {
    int nParts = 0;
    const char* p = surname;
    while (*p && nParts < maxParts) {
        while (*p == ' ') {
            p++;
        }
        if (!*p) {
            break;
        }
        const char* start = p;
        while (*p && *p != ' ') {
            p++;
        }
        const int len = (int)(p - start);
        if (len >= 2) {
            parts[nParts].s = start;
            parts[nParts].len = len;
            nParts++;
        }
    }
    // Order by length descending (simple exchange sort, n is tiny).
    for (int i = 0; i < nParts - 1; i++) {
        for (int j = i + 1; j < nParts; j++) {
            if (parts[j].len > parts[i].len) {
                SurnamePart t = parts[i];
                parts[i] = parts[j];
                parts[j] = t;
            }
        }
    }
    return nParts;
}

// Plain-text citation hover entry point: detects a "(Surname et al., 2020)"
// / "Surname (2020)" style citation at pagePos on srcPage and resolves it to
// the location of the matching bibliography entry.
//
// s        - hover state; owns the lookup cache, which is created on demand
// engine   - source of page text
// srcPage  - 1-based page under the cursor
// pagePos  - cursor position in page coordinates
// destPageOut, destXOut, destYOut - written only when true is returned
//
// Returns true when a bibliography entry was found.
bool RefHoverTryPlainText(RefHoverState* s, EngineBase* engine, int srcPage, Point pagePos, int& destPageOut,
                          float& destXOut, float& destYOut) {
    if (!s || !engine || srcPage <= 0) {
        return false;
    }
    DetectedCitation cite{};
    if (!DetectCitationAtCursor(engine, srcPage, pagePos, &cite)) {
        // release whatever the detector may have allocated before failing
        FreeDetectedCitation(&cite);
        return false;
    }
    if (!cite.surname || !*cite.surname) {
        // nothing to search for; don't record a meaningless negative entry
        FreeDetectedCitation(&cite);
        return false;
    }

    if (!s->lookupCache) {
        s->lookupCache = new RefLookupCache();
    }

    bool result = LookupOrSearch(s, engine, srcPage, cite.surname, cite.year, destPageOut, destXOut, destYOut);

    // Fallback: if surname has multiple space-separated parts and the full
    // form didn't match, try each part as a prefix in descending-length
    // order. Two patterns this covers:
    //   1. Bibliography lists the entry under just the last name
    //      ("Vrielink, Oude R. A." vs. detected "Oude Vrielink").
    //   2. PDF text extraction split a single-word surname by dropping a
    //      glyph ("Bash b" for "Bashab") - the longest fragment ("Bash")
    //      prefix-matches the real surname in the bibliography.
    if (!result && str::FindChar(cite.surname, ' ')) {
        SurnamePart parts[kMaxSurnameParts];
        const int nParts = SplitSurnameParts(cite.surname, parts, kMaxSurnameParts);
        for (int i = 0; i < nParts && !result; i++) {
            char buf[64];
            const int maxLen = (int)sizeof(buf) - 1;
            int n = parts[i].len;
            if (n > maxLen) {
                n = maxLen;
                // Don't cut a multi-byte UTF-8 sequence in half: if the first
                // dropped byte is a continuation byte (10xxxxxx), back up to
                // the start of that sequence and drop it entirely.
                while (n > 0 && ((unsigned char)parts[i].s[n] & 0xC0) == 0x80) {
                    n--;
                }
            }
            memcpy(buf, parts[i].s, n);
            buf[n] = 0;
            result = LookupOrSearch(s, engine, srcPage, buf, cite.year, destPageOut, destXOut, destYOut);
        }
    }

    FreeDetectedCitation(&cite);
    return result;
}

src/RefHover.h

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,12 +3,16 @@
33

44
class EngineBase;
55
struct RenderedBitmap;
6+
struct RefLookupCache;
67

78
struct RefHoverState {
89
HWND hwndPopup = nullptr;
910
// currently shown rendered destination strip (owned)
1011
RenderedBitmap* bmp = nullptr;
1112

13+
// cache of plain-text citation lookups (lazy-init on first use)
14+
RefLookupCache* lookupCache = nullptr;
15+
1216
// Pending hover request: set by RefHoverSchedule, consumed by
1317
// RefHoverOnTimer when the hover-delay timer fires.
1418
struct Pending {
@@ -104,3 +108,11 @@ bool RefHoverWheelZoom(RefHoverState* s, EngineBase* engine, int wheelDelta);
104108
// (continuous scrolling). Popup window keeps its initial size; only the
105109
// rendered region's Y (and possibly page number) changes.
106110
bool RefHoverWheelScroll(RefHoverState* s, EngineBase* engine, int wheelDelta);
111+
112+
// Plain-text citation hover: when no link element is under the cursor, try
113+
// to detect a "(Surname et al., 2020)" / "Surname (2020)" pattern at pagePos
114+
// on srcPage, find the bibliography entry that matches, and return its
115+
// location. Returns true on success and fills destPage/destX/destY.
116+
// Lookups are cached on s.
117+
bool RefHoverTryPlainText(RefHoverState* s, EngineBase* engine, int srcPage, Point pagePos, int& destPageOut,
118+
float& destXOut, float& destYOut);

0 commit comments

Comments
 (0)