Skip to content

Commit bc9ae39

Browse files
feat(google-scholar): add cite and profile commands, fix search dedup (#1176)
* feat(google-scholar): add cite and profile commands, fix search dedup
  - cite: get BibTeX/EndNote/RefMan/RefWorks citation for a paper. Clicks the cite button in search results and fetches the citation content from Google's citation endpoint.
  - profile: view an author's Google Scholar profile (h-index, i10-index, citation count, top papers). Accepts author name or Scholar user ID.
  - search: fix duplicate results caused by CSS selector matching both outer container (.gs_r.gs_or.gs_scl) and inner child (.gs_ri) for each paper.
  Closes #1174, closes #1175
* fix(google-scholar): fail fast on cite and profile misses
* fix(google-scholar): document new commands and lock dedup test

---------

Co-authored-by: jackwener <jakevingoo@gmail.com>
1 parent 02dbbb1 commit bc9ae39

11 files changed

Lines changed: 365 additions & 4 deletions

File tree

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -238,7 +238,7 @@ To load the source Browser Bridge extension:
238238
| **quark** | `ls` `mkdir` `mv` `rename` `rm` `save` `share-tree` |
239239
| **uiverse** | `code` `preview` |
240240
| **baidu-scholar** | `search` |
241-
| **google-scholar** | `search` |
241+
| **google-scholar** | `search` `cite` `profile` |
242242
| **gov-law** | `search` `recent` |
243243
| **gov-policy** | `search` `recent` |
244244
| **nowcoder** | `hot` `trending` `topics` `recommend` `creators` `companies` `jobs` `search` `suggest` `experience` `referral` `salary` `papers` `practice` `notifications` `detail` |

README.zh-CN.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -239,7 +239,7 @@ npm link
239239
| **uiverse** | `code` `preview` | 浏览器 |
240240
| **apple-podcasts** | `search` `episodes` `top` | 公开 |
241241
| **baidu-scholar** | `search` | 公开 |
242-
| **google-scholar** | `search` | 公开 |
242+
| **google-scholar** | `search` `cite` `profile` | 公开 |
243243
| **gov-law** | `search` `recent` | 公开 |
244244
| **gov-policy** | `search` `recent` | 公开 |
245245
| **nowcoder** | `hot` `trending` `topics` `recommend` `creators` `companies` `jobs` `search` `suggest` `experience` `referral` `salary` `papers` `practice` `notifications` `detail` | 公开 / 浏览器 |

cli-manifest.json

Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7391,6 +7391,86 @@
73917391
"modulePath": "google/trends.js",
73927392
"sourceFile": "google/trends.js"
73937393
},
7394+
{
7395+
"site": "google-scholar",
7396+
"name": "cite",
7397+
"description": "Get citation for a Google Scholar paper",
7398+
"domain": "scholar.google.com",
7399+
"strategy": "public",
7400+
"browser": true,
7401+
"args": [
7402+
{
7403+
"name": "query",
7404+
"type": "str",
7405+
"required": true,
7406+
"positional": true,
7407+
"help": "Paper title to search for"
7408+
},
7409+
{
7410+
"name": "style",
7411+
"type": "str",
7412+
"default": "bibtex",
7413+
"required": false,
7414+
"help": "Citation format",
7415+
"choices": [
7416+
"bibtex",
7417+
"endnote",
7418+
"refman",
7419+
"refworks"
7420+
]
7421+
},
7422+
{
7423+
"name": "index",
7424+
"type": "int",
7425+
"default": 1,
7426+
"required": false,
7427+
"help": "Which search result to cite (1-based)"
7428+
}
7429+
],
7430+
"columns": [
7431+
"title",
7432+
"format",
7433+
"citation"
7434+
],
7435+
"type": "js",
7436+
"modulePath": "google-scholar/cite.js",
7437+
"sourceFile": "google-scholar/cite.js",
7438+
"navigateBefore": false
7439+
},
7440+
{
7441+
"site": "google-scholar",
7442+
"name": "profile",
7443+
"description": "View a Google Scholar author profile",
7444+
"domain": "scholar.google.com",
7445+
"strategy": "public",
7446+
"browser": true,
7447+
"args": [
7448+
{
7449+
"name": "author",
7450+
"type": "str",
7451+
"required": true,
7452+
"positional": true,
7453+
"help": "Author name or Scholar user ID (e.g. JicYPdAAAAAJ)"
7454+
},
7455+
{
7456+
"name": "limit",
7457+
"type": "int",
7458+
"default": 10,
7459+
"required": false,
7460+
"help": "Max papers to show (max 20)"
7461+
}
7462+
],
7463+
"columns": [
7464+
"rank",
7465+
"title",
7466+
"cited",
7467+
"year"
7468+
],
7469+
"type": "js",
7470+
"modulePath": "google-scholar/profile.js",
7471+
"sourceFile": "google-scholar/profile.js",
7472+
"navigateBefore": false
7473+
},
73947474
{
73957475
"site": "google-scholar",
73967476
"name": "search",

clis/google-scholar/cite.js

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
import { cli, Strategy } from '@jackwener/opencli/registry';
2+
import { CommandExecutionError } from '@jackwener/opencli/errors';
3+
import { requireNonEmptyQuery } from '../_shared/common.js';
4+
5+
cli({
6+
site: 'google-scholar',
7+
name: 'cite',
8+
description: 'Get citation for a Google Scholar paper',
9+
domain: 'scholar.google.com',
10+
strategy: Strategy.PUBLIC,
11+
browser: true,
12+
args: [
13+
{ name: 'query', positional: true, required: true, help: 'Paper title to search for' },
14+
{ name: 'style', default: 'bibtex', choices: ['bibtex', 'endnote', 'refman', 'refworks'], help: 'Citation format' },
15+
{ name: 'index', type: 'int', default: 1, help: 'Which search result to cite (1-based)' },
16+
],
17+
columns: ['title', 'format', 'citation'],
18+
navigateBefore: false,
19+
func: async (page, kwargs) => {
20+
const query = requireNonEmptyQuery(kwargs.query);
21+
const format = kwargs.style || 'bibtex';
22+
const index = Math.max(1, kwargs.index || 1) - 1;
23+
24+
await page.goto(`https://scholar.google.com/scholar?q=${encodeURIComponent(query)}&hl=en`);
25+
await page.wait(3);
26+
27+
const clicked = await page.evaluate(`(() => {
28+
var cites = document.querySelectorAll('a.gs_or_cit');
29+
if (cites.length <= ${index}) return { ok: false, reason: 'result not found at index ${index + 1}' };
30+
var titleEl = document.querySelectorAll('.gs_r.gs_or.gs_scl')[${index}];
31+
var title = '';
32+
if (titleEl) {
33+
var t = titleEl.querySelector('.gs_rt a, h3 a');
34+
title = t ? t.textContent.trim() : '';
35+
}
36+
cites[${index}].click();
37+
return { ok: true, title: title };
38+
})()`);
39+
40+
if (!clicked?.ok) {
41+
throw new CommandExecutionError(clicked?.reason || `Could not find search result at index ${index + 1}`);
42+
}
43+
44+
await page.wait(2);
45+
46+
const formatMap = { bibtex: 'BibTeX', endnote: 'EndNote', refman: 'RefMan', refworks: 'RefWorks' };
47+
const formatLabel = formatMap[format] || 'BibTeX';
48+
49+
const citeUrl = await page.evaluate(`(() => {
50+
var links = document.querySelectorAll('#gs_cit a.gs_citi');
51+
for (var i = 0; i < links.length; i++) {
52+
if (links[i].textContent.trim() === '${formatLabel}') return links[i].href;
53+
}
54+
return null;
55+
})()`);
56+
57+
if (!citeUrl) {
58+
throw new CommandExecutionError(`Could not find ${formatLabel} citation link for result ${index + 1}`);
59+
}
60+
61+
await page.goto(citeUrl);
62+
await page.wait(2);
63+
64+
const citation = await page.evaluate(`(() => {
65+
return (document.body.innerText || '').trim();
66+
})()`);
67+
68+
if (!citation) {
69+
throw new CommandExecutionError(`${formatLabel} citation page returned an empty response`);
70+
}
71+
72+
return [{ title: clicked.title, format: format, citation }];
73+
},
74+
});

clis/google-scholar/cite.test.js

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
import { describe, expect, it, vi } from 'vitest';
2+
import { CommandExecutionError } from '@jackwener/opencli/errors';
3+
import { getRegistry } from '@jackwener/opencli/registry';
4+
import './cite.js';
5+
6+
// Unit tests for the `google-scholar cite` command, driven through page doubles.
describe('google-scholar cite command', () => {
  const command = getRegistry().get('google-scholar/cite');

  // Page double whose evaluate() resolves the given values, one per call, in order.
  const pageWithEvaluateResults = (...results) => {
    const evaluate = vi.fn();
    results.forEach((value) => evaluate.mockResolvedValueOnce(value));
    return {
      goto: vi.fn().mockResolvedValue(undefined),
      wait: vi.fn().mockResolvedValue(undefined),
      evaluate,
    };
  };

  it('registers as a public browser command', () => {
    expect(command).toBeDefined();
    const { site, strategy, browser } = command;
    expect(site).toBe('google-scholar');
    expect(strategy).toBe('public');
    expect(browser).toBe(true);
  });

  it('rejects empty queries before browser navigation', async () => {
    const goto = vi.fn();
    const attempt = command.func({ goto }, { query: ' ' });
    await expect(attempt).rejects.toMatchObject({
      name: 'ArgumentError',
      code: 'ARGUMENT',
    });
    expect(goto).not.toHaveBeenCalled();
  });

  it('throws when the requested search result index does not exist', async () => {
    const page = pageWithEvaluateResults({ ok: false, reason: 'result not found at index 2' });
    const attempt = command.func(page, { query: 'test', index: 2 });
    await expect(attempt).rejects.toThrow(CommandExecutionError);
  });

  it('looks up the requested citation style instead of only locking BibTeX', async () => {
    const page = pageWithEvaluateResults(
      { ok: true, title: 'Paper Title' },
      'https://example.com/refworks',
      'RefWorks citation body',
    );
    const result = await command.func(page, { query: 'test', style: 'refworks' });
    const expectedRow = { title: 'Paper Title', format: 'refworks', citation: 'RefWorks citation body' };
    expect(result).toEqual([expectedRow]);
    // The second evaluate() call is the script that searches the cite dialog links.
    const linkLookupScript = page.evaluate.mock.calls[1][0];
    expect(linkLookupScript).toContain('RefWorks');
  });
});

clis/google-scholar/profile.js

Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
import { cli, Strategy } from '@jackwener/opencli/registry';
2+
import { CommandExecutionError } from '@jackwener/opencli/errors';
3+
import { clampInt, requireNonEmptyQuery } from '../_shared/common.js';
4+
5+
/**
 * Registers the `google-scholar profile` command.
 *
 * Arguments (via `kwargs`):
 *   - author: author name or Scholar user ID; validated by `requireNonEmptyQuery`.
 *             A value made of exactly 12 characters from [A-Za-z0-9_-] is treated
 *             as a user ID and opened directly; anything else goes through the
 *             author search and the first matching profile link is clicked.
 *   - limit:  number of papers to return, passed through `clampInt(…, 10, 1, 20)`.
 *
 * Resolves to an array whose first row (rank 0) summarizes the author
 * (name, affiliation, h-index, i10-index, total citations) followed by one row
 * per paper `{ rank, title, cited, year }`.
 * Throws CommandExecutionError when no profile link is found, when the profile
 * page exposes no author name, or when it lists no papers.
 */
cli({
  site: 'google-scholar',
  name: 'profile',
  description: 'View a Google Scholar author profile',
  domain: 'scholar.google.com',
  strategy: Strategy.PUBLIC,
  browser: true,
  args: [
    { name: 'author', positional: true, required: true, help: 'Author name or Scholar user ID (e.g. JicYPdAAAAAJ)' },
    { name: 'limit', type: 'int', default: 10, help: 'Max papers to show (max 20)' },
  ],
  columns: ['rank', 'title', 'cited', 'year'],
  navigateBefore: false,
  func: async (page, kwargs) => {
    const author = requireNonEmptyQuery(kwargs.author, 'author');
    const limit = clampInt(kwargs.limit, 10, 1, 20);

    // The pattern only admits URL-safe characters, so the ID can be embedded as-is.
    const isUserId = /^[A-Za-z0-9_-]{12}$/.test(author);
    if (isUserId) {
      await page.goto(`https://scholar.google.com/citations?user=${author}&hl=en&sortby=citedby`);
    } else {
      await page.goto(`https://scholar.google.com/citations?view_op=search_authors&mauthors=${encodeURIComponent(author)}&hl=en`);
      await page.wait(3);

      // Open the first author hit; the click triggers navigation to the profile.
      const profileClicked = await page.evaluate(`(() => {
        const link = document.querySelector('.gs_ai_pho, .gsc_oai_photo, a[href*="citations?user="]');
        if (link) { link.click(); return true; }
        return false;
      })()`);

      if (!profileClicked) {
        throw new CommandExecutionError(`No profile found for: ${author}`);
      }
    }

    await page.wait(3);

    const data = await page.evaluate(`(() => {
      const text = (el) => (el ? (el.textContent || '').trim() : '');

      // Stats cells are read at positions 0, 2 and 4 (citations, h-index, i10-index).
      const stats = document.querySelectorAll('#gsc_rsb_st td.gsc_rsb_std');

      const papers = [];
      const rows = document.querySelectorAll('#gsc_a_b .gsc_a_tr');
      for (let i = 0; i < rows.length && i < ${JSON.stringify(limit)}; i++) {
        const titleEl = rows[i].querySelector('.gsc_a_at');
        if (!titleEl) continue;
        papers.push({
          rank: i + 1,
          title: text(titleEl),
          // A blank citation cell means the same as a missing one: report '0'.
          cited: text(rows[i].querySelector('.gsc_a_ac')) || '0',
          year: text(rows[i].querySelector('.gsc_a_y span')),
        });
      }

      return {
        name: text(document.querySelector('#gsc_prf_in')),
        affiliation: text(document.querySelector('.gsc_prf_il')),
        citations: text(stats[0]),
        hIndex: text(stats[2]),
        i10Index: text(stats[4]),
        papers: papers,
      };
    })()`);

    if (!data?.name) {
      throw new CommandExecutionError(`Could not load Google Scholar profile for: ${author}`);
    }

    if (!data.papers || data.papers.length === 0) {
      throw new CommandExecutionError(`No papers found for: ${data.name || author}`);
    }

    // Rank 0 is reserved for the author summary so it fits the paper columns.
    const summary = {
      rank: 0,
      title: data.affiliation ? `${data.name} (${data.affiliation})` : data.name,
      cited: `h=${data.hIndex} i10=${data.i10Index} total=${data.citations}`,
      year: '-',
    };

    return [summary, ...data.papers];
  },
});
clis/google-scholar/profile.test.js

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
import { describe, expect, it, vi } from 'vitest';
2+
import { CommandExecutionError } from '@jackwener/opencli/errors';
3+
import { getRegistry } from '@jackwener/opencli/registry';
4+
import './profile.js';
5+
6+
// Unit tests for the `google-scholar profile` command, driven through page doubles.
describe('google-scholar profile command', () => {
  const command = getRegistry().get('google-scholar/profile');

  // Page double whose first evaluate() call resolves to `firstResult`.
  const pageWithFirstEvaluateResult = (firstResult) => ({
    goto: vi.fn().mockResolvedValue(undefined),
    wait: vi.fn().mockResolvedValue(undefined),
    evaluate: vi.fn().mockResolvedValueOnce(firstResult),
  });

  it('registers as a public browser command', () => {
    expect(command).toBeDefined();
    const { site, strategy, browser } = command;
    expect(site).toBe('google-scholar');
    expect(strategy).toBe('public');
    expect(browser).toBe(true);
  });

  it('rejects empty author before browser navigation', async () => {
    const goto = vi.fn();
    const attempt = command.func({ goto }, { author: ' ' });
    await expect(attempt).rejects.toMatchObject({
      name: 'ArgumentError',
      code: 'ARGUMENT',
    });
    expect(goto).not.toHaveBeenCalled();
  });

  it('throws when author search does not resolve to a profile', async () => {
    // `false` is what the click script reports when no profile link exists.
    const page = pageWithFirstEvaluateResult(false);
    const attempt = command.func(page, { author: 'missing author' });
    await expect(attempt).rejects.toThrow(CommandExecutionError);
  });

  it('throws when the loaded profile has no papers', async () => {
    const profileWithoutPapers = {
      name: 'Author Name',
      affiliation: 'Org',
      citations: '0',
      hIndex: '0',
      i10Index: '0',
      papers: [],
    };
    const page = pageWithFirstEvaluateResult(profileWithoutPapers);
    const attempt = command.func(page, { author: 'JicYPdAAAAAJ' });
    await expect(attempt).rejects.toThrow(CommandExecutionError);
  });
});

clis/google-scholar/search.js

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ cli({
2323
(() => {
2424
const normalize = v => (v || '').replace(/\\s+/g, ' ').trim();
2525
const results = [];
26-
for (const el of document.querySelectorAll('.gs_r.gs_or.gs_scl, .gs_ri')) {
26+
for (const el of document.querySelectorAll('.gs_r.gs_or.gs_scl')) {
2727
const container = el.querySelector('.gs_ri') || el;
2828
const titleEl = container.querySelector('.gs_rt a, h3 a');
2929
const title = normalize(titleEl?.textContent);

clis/google-scholar/search.test.js

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,4 +20,19 @@ describe('google-scholar search command', () => {
2020
});
2121
expect(page.goto).not.toHaveBeenCalled();
2222
});
23+
24+
// Guards the dedup fix by inspecting the script text handed to page.evaluate():
// it must select only the outer result cards and still read from the inner
// `.gs_ri` container.
it('locks dedup to outer Scholar result cards while preserving inner content extraction', async () => {
  const resolvingTo = (value) => vi.fn().mockResolvedValue(value);
  const page = {
    goto: resolvingTo(undefined),
    wait: resolvingTo(undefined),
    evaluate: resolvingTo([]),
  };

  await command.func(page, { query: 'transformer' });

  const [[injectedSource]] = page.evaluate.mock.calls;
  expect(injectedSource).toContain("document.querySelectorAll('.gs_r.gs_or.gs_scl')");
  expect(injectedSource).not.toContain(".gs_r.gs_or.gs_scl, .gs_ri");
  expect(injectedSource).toContain("const container = el.querySelector('.gs_ri') || el");
});
2338
});

0 commit comments

Comments
 (0)