Skip to content

Commit df278da

Browse files
authored
chore: swap globby with tinyglobby (#1017)
Replaces `globby` with `tinyglobby` and the `ignore` package. Since `tinyglobby` lacks built-in gitignore support, gitignore processing is now handled by invoking `git ls-files --others --ignored --exclude-standard` to get ignored paths. When git is unavailable (non-git directory or git not installed), it falls back to manually reading all `.gitignore` files and applying each one scoped to its directory using the `ignore` package.

## Scope of Changes

- Removed `globby`; added `tinyglobby` and `ignore`
- Rewrote `getActorLocalFilePaths` to use `git ls-files` for gitignore rules, with an `ignore`-package fallback when git is unavailable
- Updated `checkIfStorageIsEmpty` to use `glob` from `tinyglobby`
- Added tests for the gitignore fallback path (no git available), covering various patterns (wildcards, negation, comments, empty) and nested `.gitignore` scoping across directories

## Behavior Changes

None expected for projects inside a git repo. For non-git directories the fallback path now manually applies `.gitignore` rules, which may differ slightly from `globby`'s built-in handling in edge cases.

## Risks & Considerations

- Minor behavioral differences between `tinyglobby` and `globby` in edge cases (symlinks, brace expansion) are possible but unlikely to surface in practice.
- The `git ls-files` invocation adds a subprocess call; if git is slow or unavailable, the fallback is triggered automatically.

## Testing

Added tests covering the fallback path (git mocked to throw): no `.gitignore`, various patterns (wildcards, negation, comments, empty), and nested `.gitignore` scoping across directories.

### Manual e2e test

```sh
yarn tsx -e "import { getActorLocalFilePaths } from './src/lib/utils.js'; getActorLocalFilePaths('/path/to/actor').then(console.log)"
```

Closes #1007
1 parent e929d74 commit df278da

5 files changed

Lines changed: 295 additions & 45 deletions

File tree

package.json

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -93,8 +93,8 @@
9393
"escape-string-regexp": "~5.0.0",
9494
"execa": "^9.5.2",
9595
"express": "~5.2.0",
96-
"globby": "~15.0.0",
9796
"handlebars": "~4.7.8",
97+
"ignore": "^5.3.2",
9898
"indent-string": "^5.0.0",
9999
"is-ci": "~4.1.0",
100100
"istextorbinary": "~9.5.0",
@@ -108,6 +108,7 @@
108108
"string-width": "^8.0.0",
109109
"strip-ansi": "^7.1.0",
110110
"tiged": "~2.12.7",
111+
"tinyglobby": "^0.2.15",
111112
"which": "^6.0.0",
112113
"widest-line": "^6.0.0",
113114
"wrap-ansi": "^10.0.0"

src/lib/utils.ts

Lines changed: 120 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,10 @@
1+
import { execSync } from 'node:child_process';
12
import { createWriteStream, existsSync, mkdirSync, readdirSync, readFileSync, writeFileSync } from 'node:fs';
23
import { mkdir, readFile } from 'node:fs/promises';
34
import type { IncomingMessage } from 'node:http';
45
import { get } from 'node:https';
56
import { homedir } from 'node:os';
6-
import { dirname, join } from 'node:path';
7+
import { dirname, join, relative } from 'node:path';
78
import process from 'node:process';
89
import { finished } from 'node:stream/promises';
910

@@ -15,12 +16,13 @@ import { type ActorRun, ApifyClient, type ApifyClientOptions, type Build } from
1516
import archiver from 'archiver';
1617
import { AxiosHeaders } from 'axios';
1718
import escapeStringRegexp from 'escape-string-regexp';
18-
import { globby } from 'globby';
19+
import ignoreModule, { type Ignore } from 'ignore';
1920
import { getEncoding } from 'istextorbinary';
2021
import { Mime } from 'mime';
2122
import otherMimes from 'mime/types/other.js';
2223
import standardMimes from 'mime/types/standard.js';
2324
import { gte, minVersion, satisfies } from 'semver';
25+
import { escapePath, glob } from 'tinyglobby';
2426

2527
import {
2628
ACTOR_ENV_VARS,
@@ -134,8 +136,7 @@ const getTokenWithAuthFileFallback = (existingToken?: string) => {
134136
return existingToken;
135137
};
136138

137-
// biome-ignore format: off
138-
type CJSAxiosHeaders = import('axios', { with: { 'resolution-mode': 'require' } }).AxiosRequestConfig['headers'];
139+
type CJSAxiosHeaders = import('axios', { with: { 'resolution-mode': 'require' }}).AxiosRequestConfig['headers'];
139140

140141
/**
141142
* Returns options for ApifyClient
@@ -231,9 +232,13 @@ export const setLocalEnv = async (actDir: string) => {
231232
if (gitignoreAdditions.length > 0) {
232233
if (gitignoreContents.length > 0) {
233234
gitignoreAdditions.unshift('# Added by Apify CLI');
234-
writeFileSync(gitignorePath, `\n${gitignoreAdditions.join('\n')}\n`, { flag: 'a' });
235+
writeFileSync(gitignorePath, `\n${gitignoreAdditions.join('\n')}\n`, {
236+
flag: 'a',
237+
});
235238
} else {
236-
writeFileSync(gitignorePath, `${gitignoreAdditions.join('\n')}\n`, { flag: 'w' });
239+
writeFileSync(gitignorePath, `${gitignoreAdditions.join('\n')}\n`, {
240+
flag: 'w',
241+
});
237242
}
238243
}
239244
};
@@ -285,18 +290,121 @@ export const createSourceFiles = async (paths: string[], cwd: string) => {
285290
});
286291
};
287292

293+
/**
294+
* Fallback for when git is unavailable: find all .gitignore files and build a filter
295+
* using the `ignore` package, scoped to each file's directory.
296+
* Also walks ancestor directories to pick up parent .gitignore files (e.g. monorepo root),
297+
* stopping at the first .git boundary found.
298+
*/
299+
const getGitignoreFallbackFilter = async (cwd: string): Promise<(paths: string[]) => string[]> => {
300+
const gitignoreFiles = await glob('**/.gitignore', {
301+
dot: true,
302+
cwd,
303+
ignore: ['.git/**'],
304+
expandDirectories: false,
305+
});
306+
307+
// `ignore` is a CJS package; TypeScript sees its default import as the module
308+
// object rather than the callable factory, so we cast through unknown.
309+
const makeIg = ignoreModule as unknown as () => Ignore;
310+
311+
const filters: { dir: string; ig: Ignore; ancestorPrefix?: string }[] = [];
312+
313+
for (const gitignoreFile of gitignoreFiles) {
314+
const gitignoreDir = dirname(gitignoreFile); // e.g. 'src' or '.'
315+
const content = await readFile(join(cwd, gitignoreFile), 'utf-8');
316+
filters.push({ dir: gitignoreDir === '.' ? '' : gitignoreDir, ig: makeIg().add(content) });
317+
}
318+
319+
// Walk ancestor directories to pick up parent .gitignore files (e.g. monorepo root).
320+
// Check for a .git boundary FIRST so we stop before processing the git root's own
321+
// .gitignore — that file is handled by `git ls-files` when git is available, and
322+
// stopping here avoids accidentally applying rules from an unrelated outer repository.
323+
let parentDir = dirname(cwd);
324+
while (parentDir !== dirname(parentDir)) {
325+
if (existsSync(join(parentDir, '.git'))) {
326+
break;
327+
}
328+
329+
const parentGitignorePath = join(parentDir, '.gitignore');
330+
if (existsSync(parentGitignorePath)) {
331+
try {
332+
const content = await readFile(parentGitignorePath, 'utf-8');
333+
// Paths passed to this filter are relative to cwd. To test them against
334+
// a .gitignore that lives above cwd we need to prepend the relative path
335+
// from the ancestor dir to cwd so the ignore patterns see the right scope.
336+
const ancestorPrefix = relative(parentDir, cwd);
337+
filters.push({ dir: '', ig: makeIg().add(content), ancestorPrefix });
338+
} catch {
339+
// Ignore read errors
340+
}
341+
}
342+
343+
parentDir = dirname(parentDir);
344+
}
345+
346+
if (filters.length === 0) {
347+
return (paths) => paths;
348+
}
349+
350+
return (paths) =>
351+
paths.filter((filePath) => {
352+
for (const { dir, ig, ancestorPrefix } of filters) {
353+
let relativePath: string | null;
354+
if (!dir) {
355+
relativePath = ancestorPrefix ? `${ancestorPrefix}/${filePath}` : filePath;
356+
} else if (filePath.startsWith(`${dir}/`)) {
357+
relativePath = filePath.slice(dir.length + 1);
358+
} else {
359+
relativePath = null;
360+
}
361+
if (relativePath !== null && ig.ignores(relativePath)) {
362+
return false;
363+
}
364+
}
365+
return true;
366+
});
367+
};
368+
288369
/**
289370
* Get Actor local files, omit files defined in .gitignore and .git folder
290371
* All dot files (.file) and dot folders (.folder/) are included.
291372
*/
292-
export const getActorLocalFilePaths = async (cwd?: string) =>
293-
globby(['*', '**/**'], {
294-
ignore: ['.git/**', 'apify_storage', 'node_modules', 'storage', 'crawlee_storage'],
295-
gitignore: true,
373+
export const getActorLocalFilePaths = async (cwd?: string) => {
374+
const resolvedCwd = cwd ?? process.cwd();
375+
376+
const ignore = ['.git/**', 'apify_storage', 'node_modules', 'storage', 'crawlee_storage'];
377+
378+
let fallbackFilter: ((paths: string[]) => string[]) | null = null;
379+
380+
// Use git ls-files to get gitignored paths — this correctly handles ancestor .gitignore files,
381+
// nested .gitignore files, .git/info/exclude, and global gitignore config
382+
try {
383+
const gitIgnored = execSync('git ls-files --others --ignored --exclude-standard --directory', {
384+
cwd: resolvedCwd,
385+
encoding: 'utf-8',
386+
stdio: ['ignore', 'pipe', 'ignore'],
387+
})
388+
.split('\n')
389+
.filter(Boolean)
390+
.map((p) => escapePath(p));
391+
392+
ignore.push(...gitIgnored);
393+
} catch {
394+
// git is unavailable or directory is not a git repo — fall back to parsing .gitignore files
395+
fallbackFilter = await getGitignoreFallbackFilter(resolvedCwd);
396+
}
397+
398+
const paths = await glob(['*', '**/**'], {
399+
ignore,
296400
dot: true,
297-
cwd,
401+
expandDirectories: false,
402+
cwd: resolvedCwd,
298403
});
299404

405+
return fallbackFilter ? fallbackFilter(paths) : paths;
406+
};
407+
300408
/**
301409
* Create zip file with all Actor files specified with pathsToZip
302410
*/
@@ -444,7 +552,7 @@ export const getNpmCmd = (): string => {
444552
* Returns true if apify storage is empty (except INPUT.*)
445553
*/
446554
export const checkIfStorageIsEmpty = async () => {
447-
const filesWithoutInput = await globby([
555+
const filesWithoutInput = await glob([
448556
`${getLocalStorageDir()}/**`,
449557
// Omit INPUT.* file
450558
`!${getLocalKeyValueStorePath()}/${KEY_VALUE_STORE_KEYS.INPUT}.*`,
Lines changed: 162 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,162 @@
1+
import { mkdirSync, writeFileSync } from 'node:fs';
2+
import { join } from 'node:path';
3+
4+
import { ensureFolderExistsSync } from '../../../src/lib/files.js';
5+
import { getActorLocalFilePaths } from '../../../src/lib/utils.js';
6+
import { useTempPath } from '../../__setup__/hooks/useTempPath.js';
7+
8+
// Mock execSync to simulate git not being available.
9+
// vi.mock is hoisted before imports, so utils.ts gets the mocked version.
10+
vi.mock('node:child_process', async (importOriginal) => {
11+
const original = await importOriginal<typeof import('node:child_process')>();
12+
return {
13+
...original,
14+
execSync: () => {
15+
throw new Error('not a git repository');
16+
},
17+
};
18+
});
19+
20+
const TEST_DIR = 'gitignore-fallback-test-dir';
21+
const FOLDERS = ['src', 'src/utils'];
22+
const FOLDERS_TO_IGNORE = ['dist', 'src/generated'];
23+
const FILES = ['main.js', 'src/index.js', 'src/utils/helper.js'];
24+
const FILES_IN_IGNORED_DIR = ['dist/bundle.js', 'src/generated/types.js'];
25+
const FILES_TO_IGNORE = ['debug.log'];
26+
27+
describe('Utils - gitignore fallback (no git)', () => {
28+
const { tmpPath, joinPath, beforeAllCalls, afterAllCalls } = useTempPath(TEST_DIR, {
29+
create: true,
30+
remove: true,
31+
cwd: false,
32+
cwdParent: false,
33+
});
34+
35+
beforeAll(async () => {
36+
await beforeAllCalls();
37+
38+
// NOTE: No git init here — execSync is mocked to throw, triggering the fallback path.
39+
40+
FOLDERS.concat(FOLDERS_TO_IGNORE).forEach((folder) => {
41+
ensureFolderExistsSync(tmpPath, folder);
42+
});
43+
44+
FILES.concat(FILES_TO_IGNORE, FILES_IN_IGNORED_DIR).forEach((file) =>
45+
writeFileSync(joinPath(file), 'content', { flag: 'w' }),
46+
);
47+
48+
const toIgnore = FOLDERS_TO_IGNORE.concat(FILES_TO_IGNORE).join('\n');
49+
writeFileSync(joinPath('.gitignore'), toIgnore, { flag: 'w' });
50+
});
51+
52+
afterAll(async () => {
53+
await afterAllCalls();
54+
});
55+
56+
it('should exclude files listed in .gitignore when git is unavailable', async () => {
57+
const paths = await getActorLocalFilePaths(tmpPath);
58+
59+
FILES.forEach((file) => expect(paths).toContain(file));
60+
FILES_IN_IGNORED_DIR.concat(FILES_TO_IGNORE).forEach((file) => expect(paths).not.toContain(file));
61+
});
62+
});
63+
64+
const NESTED_TEST_DIR = 'gitignore-nested-test-dir';
65+
66+
describe('Utils - nested .gitignore scoping (no git)', () => {
67+
const { tmpPath, joinPath, beforeAllCalls, afterAllCalls } = useTempPath(NESTED_TEST_DIR, {
68+
create: true,
69+
remove: true,
70+
cwd: false,
71+
cwdParent: false,
72+
});
73+
74+
beforeAll(async () => {
75+
await beforeAllCalls();
76+
77+
// Create directory structure
78+
ensureFolderExistsSync(tmpPath, 'src');
79+
ensureFolderExistsSync(tmpPath, 'src/internal');
80+
81+
// Create files: one public, one that should be scoped-ignored by src/.gitignore
82+
writeFileSync(joinPath('src/public.js'), 'content', { flag: 'w' });
83+
writeFileSync(joinPath('src/internal/secret.js'), 'content', { flag: 'w' });
84+
85+
// Only a nested .gitignore — the root has no entry for src/internal
86+
writeFileSync(joinPath('src/.gitignore'), 'internal/', { flag: 'w' });
87+
});
88+
89+
afterAll(async () => {
90+
await afterAllCalls();
91+
});
92+
93+
it('should exclude files matched by a nested .gitignore scoped to its own directory', async () => {
94+
const paths = await getActorLocalFilePaths(tmpPath);
95+
96+
// src/public.js should be present
97+
expect(paths).toContain('src/public.js');
98+
99+
// src/internal/secret.js should be excluded by src/.gitignore's `internal/` rule
100+
expect(paths).not.toContain('src/internal/secret.js');
101+
});
102+
});
103+
104+
const PARENT_TEST_DIR = 'gitignore-parent-test-dir';
105+
106+
describe('Utils - parent .gitignore applied to subproject (no git)', () => {
107+
// tmpPath is the "project root" that holds the parent .gitignore.
108+
// The actual cwd passed to getActorLocalFilePaths is tmpPath/subproject/.
109+
const { tmpPath, beforeAllCalls, afterAllCalls } = useTempPath(PARENT_TEST_DIR, {
110+
create: true,
111+
remove: true,
112+
cwd: false,
113+
cwdParent: false,
114+
});
115+
116+
let subprojectPath: string;
117+
118+
beforeAll(async () => {
119+
await beforeAllCalls();
120+
121+
subprojectPath = join(tmpPath, 'subproject');
122+
123+
// Parent .gitignore — rules that should apply to everything inside subproject/.
124+
// No fake .git is needed: the ancestor-walker already stops at the apify-cli
125+
// repo root (.git lives there) before touching its own .gitignore.
126+
writeFileSync(join(tmpPath, '.gitignore'), '*.secret\nbuild/\n', { flag: 'w' });
127+
128+
// Subproject directory structure
129+
mkdirSync(subprojectPath, { recursive: true });
130+
ensureFolderExistsSync(subprojectPath, 'src');
131+
ensureFolderExistsSync(subprojectPath, 'build');
132+
133+
// Files that should be kept
134+
writeFileSync(join(subprojectPath, 'main.js'), 'content', { flag: 'w' });
135+
writeFileSync(join(subprojectPath, 'src', 'utils.js'), 'content', { flag: 'w' });
136+
137+
// Files/dirs that should be excluded by parent .gitignore
138+
writeFileSync(join(subprojectPath, 'config.secret'), 'content', { flag: 'w' });
139+
writeFileSync(join(subprojectPath, 'src', 'db.secret'), 'content', { flag: 'w' });
140+
writeFileSync(join(subprojectPath, 'build', 'output.js'), 'content', { flag: 'w' });
141+
});
142+
143+
afterAll(async () => {
144+
await afterAllCalls();
145+
});
146+
147+
it('should exclude files matched by *.secret pattern in parent .gitignore', async () => {
148+
const paths = await getActorLocalFilePaths(subprojectPath);
149+
150+
expect(paths).toContain('main.js');
151+
expect(paths).toContain('src/utils.js');
152+
153+
expect(paths).not.toContain('config.secret');
154+
expect(paths).not.toContain('src/db.secret');
155+
});
156+
157+
it('should exclude directory matched by build/ pattern in parent .gitignore', async () => {
158+
const paths = await getActorLocalFilePaths(subprojectPath);
159+
160+
expect(paths).not.toContain('build/output.js');
161+
});
162+
});

test/local/lib/utils.test.ts

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,12 +35,18 @@ describe('Utils', () => {
3535
beforeAll(async () => {
3636
await beforeAllCalls();
3737

38+
// Initialize a fresh git repo so the local .gitignore is parsed independently
39+
// from the parent repo (which gitignores test/tmp entirely)
40+
await execWithLog({ cmd: 'git', args: ['init'], opts: { cwd: tmpPath } });
41+
3842
FOLDERS.concat(FOLDERS_TO_IGNORE).forEach((folder) => {
3943
ensureFolderExistsSync(tmpPath, folder);
4044
});
4145

4246
FILES.concat(FILES_TO_IGNORE, FILES_IN_IGNORED_DIR).forEach((file) =>
43-
writeFileSync(joinPath(file), Math.random().toString(36).substring(7), { flag: 'w' }),
47+
writeFileSync(joinPath(file), Math.random().toString(36).substring(7), {
48+
flag: 'w',
49+
}),
4450
);
4551

4652
const toIgnore = FOLDERS_TO_IGNORE.concat(FILES_TO_IGNORE).join('\n');

0 commit comments

Comments
 (0)