Skip to content

Commit 417545d

Browse files
severo and Copilot authored
add github source (#381)
* add github source
* fetch branches
* Potential fix for pull request finding
  Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com>
* Potential fix for pull request finding
  Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com>
* copilot comments
* support /raw/
* fix test
* assume the response has the correct type
* add a TODO
* remove concept of directory prefix, remove default branch in github source
* explicitly filter out branch names with slash
* use status text

---------

Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com>
1 parent cf4eede commit 417545d

File tree

10 files changed

+499
-20
lines changed

10 files changed

+499
-20
lines changed

src/components/App/App.tsx

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import { useMemo } from 'react'
22
import { Config, ConfigProvider } from '../../hooks/useConfig.js'
3+
import { getGitHubSource } from '../../lib/sources/gitHubSource.js'
34
import { getHttpSource } from '../../lib/sources/httpSource.js'
45
import { getHuggingFaceSource } from '../../lib/sources/huggingFaceSource.js'
56
import { getHyperparamSource } from '../../lib/sources/hyperparamSource.js'
@@ -12,6 +13,7 @@ export default function App() {
1213
const col = search.get('col') === null ? undefined : Number(search.get('col'))
1314

1415
const source = getHuggingFaceSource(sourceId) ??
16+
getGitHubSource(sourceId) ??
1517
getHttpSource(sourceId) ??
1618
getHyperparamSource(sourceId, { endpoint: location.origin })
1719

src/components/Folder/Folder.test.tsx

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -95,7 +95,6 @@ describe('Folder Component', () => {
9595
sourceId: 'test-source',
9696
sourceParts: [{ text: 'test-source', sourceId: 'test-source' }],
9797
kind: 'directory',
98-
prefix: '',
9998
listFiles: () => Promise.resolve(mockFiles),
10099
}
101100
const { getByPlaceholderText, findByText, getByText, queryByText } = render(<Folder source={dirSource} />)
@@ -133,7 +132,6 @@ describe('Folder Component', () => {
133132
sourceId: 'test-source',
134133
sourceParts: [{ text: 'test-source', sourceId: 'test-source' }],
135134
kind: 'directory',
136-
prefix: '',
137135
listFiles: () => Promise.resolve(mockFiles),
138136
}
139137
const { getByPlaceholderText, findByText } = render(<Folder source={dirSource} />)
@@ -153,7 +151,6 @@ describe('Folder Component', () => {
153151
sourceId: 'test-source',
154152
sourceParts: [{ text: 'test-source', sourceId: 'test-source' }],
155153
kind: 'directory',
156-
prefix: '',
157154
listFiles: async () => {
158155
await fetch('something') // to ensure we wait for loading
159156
return []

src/components/Folder/Folder.tsx

Lines changed: 6 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -63,15 +63,15 @@ export default function Folder({ source }: FolderProps) {
6363
} else if (e.key === 'Enter') {
6464
// if there is only one result, view it
6565
if (filtered?.length === 1 && 0 in filtered) {
66-
const key = join(source.prefix, filtered[0].name)
67-
if (key.endsWith('/')) {
66+
const file = filtered[0]
67+
if (file.kind === 'directory') {
6868
// clear search because we're about to change folder
6969
if (searchRef.current) {
7070
searchRef.current.value = ''
7171
}
7272
setSearchQuery('')
7373
}
74-
location.href = `/files?key=${key}`
74+
location.href = routes?.getSourceRouteUrl?.({ sourceId: file.sourceId }) ?? `/files?key=${file.sourceId}`
7575
}
7676
} else if (e.key === 'ArrowDown') {
7777
// move focus to first list item
@@ -81,7 +81,7 @@ export default function Folder({ source }: FolderProps) {
8181
searchElement?.addEventListener('keyup', handleKeyup)
8282
// Clean up event listener
8383
return () => searchElement?.removeEventListener('keyup', handleKeyup)
84-
}, [filtered, source.prefix])
84+
}, [filtered, routes])
8585

8686
// Jump to search box if user types '/'
8787
useEffect(() => {
@@ -97,7 +97,7 @@ export default function Folder({ source }: FolderProps) {
9797
return () => { document.removeEventListener('keydown', handleKeydown) }
9898
}, [])
9999

100-
return <Layout error={error} title={source.prefix}>
100+
return <Layout error={error} title={source.sourceId}>
101101
<Breadcrumb source={source}>
102102
<input autoFocus className={cn(styles.search, customClass?.search)} placeholder='Search...' ref={searchRef} />
103103
<Dropdown className={styles.settings} label={gearIcon} align='right'>
@@ -114,7 +114,7 @@ export default function Folder({ source }: FolderProps) {
114114
<ul className={cn(styles.fileList, customClass?.fileList)} ref={listRef}>
115115
{filtered.map((file, index) =>
116116
<li key={index}>
117-
<a href={routes?.getSourceRouteUrl?.({ sourceId: file.sourceId }) ?? location.href}>
117+
<a href={routes?.getSourceRouteUrl?.({ sourceId: file.sourceId }) ?? `/files?key=${file.sourceId}`}>
118118
<span data-file-kind={file.kind}>
119119
{file.name}
120120
</span>
@@ -133,7 +133,3 @@ export default function Folder({ source }: FolderProps) {
133133
}
134134
</Layout>
135135
}
136-
137-
function join(prefix: string, file: string) {
138-
return prefix ? prefix + '/' + file : file
139-
}

src/lib/sources/gitHubSource.ts

Lines changed: 280 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,280 @@
1+
import type { DirSource, FileMetadata, FileSource, SourcePart } from './types.js'
2+
import { getFileName } from './utils.js'
3+
4+
/** Fields shared by every parsed GitHub URL. */
interface BaseUrl {
  /** Normalized URL string the parser built (or the input URL, for repository roots). */
  source: string
  /** Origin (scheme + host) of the URL that was parsed. */
  origin: string
  /** Repository identifier in `owner/repo` form. */
  repo: string
}

/** URL pointing at the root of a repository (no branch, no path). */
interface RepoUrl extends BaseUrl {
  kind: 'repo'
}

/** Fields shared by URLs that designate a location inside a branch. */
interface PathUrl extends BaseUrl {
  branch: string
  /** Path inside the branch; the parser produces it with a leading slash, or empty for the branch root. */
  path: string
}

/** URL of a directory (`/tree/` URLs). */
interface DirectoryUrl extends PathUrl {
  kind: 'directory'
  action: 'tree'
}

/** URL of a file (`/blob/`, `/raw/` or `/raw/refs/heads/` URLs). */
interface FileUrl extends PathUrl {
  kind: 'file'
  action?: 'blob' | 'raw' | 'raw/refs/heads'
  /** URL from which the file content can be fetched. */
  resolveUrl: string
}

/** URL of a file served from the raw-content host; it carries no action segment. */
interface RawFileUrl extends PathUrl {
  kind: 'file'
  action: undefined
  /** URL from which the file content can be fetched. */
  resolveUrl: string
}

/** Any GitHub URL shape understood by this module. */
type GHUrl = RepoUrl | DirectoryUrl | FileUrl | RawFileUrl

/** Base URL used when building links to GitHub pages. */
const baseUrl = 'https://github.com'
/** Base URL used when building links to raw file contents. */
const baseRawUrl = 'https://raw.githubusercontent.com'
41+
/**
 * Build the breadcrumb parts for a parsed GitHub URL.
 *
 * A repository URL yields a single part for the repository itself. Any other
 * URL yields one part for the branch root, one part per intermediate
 * directory, and a final part for the last path segment (when the path is
 * not empty).
 *
 * @param url - parsed GitHub URL
 * @returns the ordered list of source parts, from the root to the leaf
 */
function getSourceParts(url: GHUrl): SourcePart[] {
  const repoRoot = `${baseUrl}/${url.repo}`
  if (url.kind === 'repo') {
    return [{ sourceId: repoRoot, text: repoRoot }]
  }

  // The branch root is always the first part, shown as a full URL.
  const treeRoot = `${repoRoot}/tree/${url.branch}/`
  const rootPart: SourcePart = { sourceId: treeRoot, text: treeRoot }

  const segments = url.path.split('/').filter(segment => segment.length > 0)
  const leaf = segments.at(-1)
  if (!leaf) {
    // empty path: the URL designates the branch root itself
    return [rootPart]
  }

  // Every segment except the last one is a directory, linked through /tree/.
  const ancestors = segments.slice(0, -1).map((segment, index): SourcePart => ({
    sourceId: `${treeRoot}${segments.slice(0, index + 1).join('/')}`,
    text: segment + '/',
  }))

  // The last segment links through /tree/ for directories, /blob/ otherwise.
  const leafAction = url.action === 'tree' ? 'tree/' : 'blob/'
  const leafPart: SourcePart = {
    sourceId: `${repoRoot}/${leafAction}${url.branch}${url.path}`,
    text: leaf,
  }

  return [rootPart, ...ancestors, leafPart]
}
70+
/**
 * List the entries of a repository directory through the GitHub contents API.
 *
 * For a repository URL the root is listed without a `ref` query parameter;
 * for a directory URL the branch is passed as `ref`.
 *
 * @param url - parsed repository or directory URL
 * @param [options]
 * @param [options.requestInit] - request init object to pass to fetch
 * @param [options.accessToken] - access token sent as a Bearer token
 * @returns one metadata entry per item returned by the API
 * @throws Error when the response is not OK, or when its body cannot be processed
 */
async function fetchFilesList(url: DirectoryUrl | RepoUrl, options?: { requestInit?: RequestInit, accessToken?: string }): Promise<FileMetadata[]> {
  let contentsPath = '/'
  let refQuery = ''
  if (url.kind !== 'repo') {
    contentsPath = url.path
    refQuery = `?ref=${url.branch}`
  }
  const endpoint = `https://api.github.com/repos/${url.repo}/contents${contentsPath}${refQuery}`

  // Start from the caller's headers, then force the ones this call needs.
  const requestHeaders = new Headers(options?.requestInit?.headers)
  requestHeaders.set('Accept', 'application/vnd.github+json')
  const token = options?.accessToken
  if (token) {
    requestHeaders.set('Authorization', `Bearer ${token}`)
  }

  const response = await fetch(endpoint, {
    ...options?.requestInit,
    method: 'GET',
    headers: requestHeaders,
  })
  if (!response.ok) {
    const { status, statusText } = response
    const details = await response.text()
    throw new Error(`GitHub API error: ${status} ${statusText} - ${details}`)
  }

  try {
    // The payload shape is assumed, not validated.
    const data = await response.json() as {html_url: string, path: string, type: 'file' | 'dir', size: number}[]
    return data.map(({ html_url, path, size, type }) => ({
      name: getFileName(path),
      fileSize: size,
      sourceId: html_url,
      // anything that is not a plain file is presented as a directory
      kind: type === 'file' ? 'file' : 'directory',
    }))
  } catch (error) {
    const reason = error instanceof Error ? error.message : String(error)
    throw new Error(`Failed to parse GitHub API response: ${reason}`)
  }
}
99+
/**
 * Build a file or directory source from a GitHub URL.
 *
 * @param sourceId - URL to interpret as a GitHub repository, directory or file
 * @param [options]
 * @param [options.requestInit] - request init object to pass to fetch
 * @param [options.accessToken] - access token to use for authentication
 * @returns a file source for file URLs, a directory source for repository
 *   and directory URLs, or undefined when anything throws while building it
 *   (in particular when the URL is not a supported GitHub URL)
 */
export function getGitHubSource(sourceId: string, options?: {requestInit?: RequestInit, accessToken?: string}): FileSource | DirSource | undefined {
  try {
    const url = parseGitHubUrl(sourceId)
    // Suffix appended after the branch name when linking to another branch.
    const pathSuffix = url.kind === 'repo' ? '/' : url.path
    const linkAction = url.kind === 'file' ? 'blob' : 'tree'

    // Lists the repository branches as alternative versions of this source.
    const fetchVersions = async () => {
      const branches = await fetchBranchesList(url, options)
      const versions: { label: string, sourceId: string }[] = []
      for (const branch of branches) {
        // TODO(SL): support branches with slashes in their names (feature/foo/bar)
        if (branch.includes('/')) {
          continue
        }
        versions.push({
          label: branch,
          sourceId: `${baseUrl}/${url.repo}/${linkAction}/${branch}${pathSuffix}`,
        })
      }
      return { label: 'Branches', versions }
    }

    if (url.kind !== 'file') {
      return {
        kind: 'directory',
        sourceId,
        sourceParts: getSourceParts(url),
        listFiles: () => fetchFilesList(url, options),
        fetchVersions,
      }
    }

    return {
      kind: 'file',
      sourceId,
      sourceParts: getSourceParts(url),
      fileName: getFileName(url.path),
      resolveUrl: url.resolveUrl,
      requestInit: options?.requestInit,
      fetchVersions,
    }
  } catch {
    // Not a GitHub URL (or unsupported): let the caller try another source.
    return undefined
  }
}
142+
143+
// TODO(SL): support branches with slashes in their names (feature/foo)
144+
/**
 * Parse a GitHub URL into a structured description.
 *
 * Supported shapes:
 * - repository root: `https://github.com/<owner>/<repo>`
 * - directory: `https://github.com/<owner>/<repo>/tree/<branch>[/<path>]`
 * - file: `https://github.com/<owner>/<repo>/(blob|raw|raw/refs/heads)/<branch>/<path>`
 * - raw file: `https://raw.githubusercontent.com/<owner>/<repo>/[refs/heads/]<branch>/<path>`
 *
 * Branch names containing a slash are not supported: the first path segment
 * after the action is always taken as the branch.
 *
 * @param url - the URL to parse
 * @returns the parsed URL
 * @throws TypeError if `url` is not a valid URL
 * @throws Error if the URL is not an https GitHub URL, or has an unsupported shape
 */
export function parseGitHubUrl(url: string): GHUrl {
  const urlObject = new URL(url)
  // ^ throws 'TypeError: URL constructor: {url} is not a valid URL.' if url is not a valid URL

  // NOTE(review): 'github.co' looks like it may be a typo for 'github.com' — confirm it is intended.
  if (
    urlObject.protocol !== 'https:' ||
    ![
      'github.co', 'github.com', 'www.github.com', 'raw.githubusercontent.com',
    ].includes(urlObject.host)
  ) {
    throw new Error('Not a GitHub URL')
  }

  const { pathname } = urlObject

  if (urlObject.host === 'raw.githubusercontent.com') {
    // https://raw.githubusercontent.com/apache/parquet-testing/refs/heads/master/variant/README.md
    const rawFileGroups =
      /^\/(?<owner>[^/]+)\/(?<repo>[^/]+)\/(?<action>(refs\/heads\/)?)(?<branch>[^/]+)(?<path>(\/[^/]+)+)$/.exec(
        pathname
      )?.groups
    if (
      rawFileGroups?.owner !== undefined &&
      rawFileGroups.repo !== undefined &&
      rawFileGroups.branch !== undefined &&
      rawFileGroups.path !== undefined
    ) {
      const branch = rawFileGroups.branch.replace(/\//g, '%2F')
      // the optional refs/heads/ prefix is dropped from the normalized URL
      const source = `${urlObject.origin}/${rawFileGroups.owner}/${rawFileGroups.repo}/${branch}${rawFileGroups.path}`
      return {
        kind: 'file',
        source,
        origin: urlObject.origin,
        repo: rawFileGroups.owner + '/' + rawFileGroups.repo,
        branch,
        path: rawFileGroups.path,
        resolveUrl: source,
      }
    } else {
      throw new Error('Unsupported GitHub URL')
    }
  }

  // https://github.com/apache/parquet-testing
  const repoGroups = /^\/(?<owner>[^/]+)\/(?<repo>[^/]+)\/?$/.exec(
    pathname
  )?.groups
  if (repoGroups?.owner !== undefined && repoGroups.repo !== undefined) {
    return {
      kind: 'repo',
      source: url,
      origin: urlObject.origin,
      repo: repoGroups.owner + '/' + repoGroups.repo,
    }
  }

  // https://github.com/apache/parquet-testing/tree/master/variant
  // (the path may be empty, and one trailing slash is tolerated)
  const folderGroups =
    /^\/(?<owner>[^/]+)\/(?<repo>[^/]+)\/(?<action>tree)\/(?<branch>[^/]+)(?<path>(\/[^/]+)*)\/?$/.exec(
      pathname
    )?.groups
  if (
    folderGroups?.owner !== undefined &&
    folderGroups.repo !== undefined &&
    folderGroups.action !== undefined &&
    folderGroups.branch !== undefined &&
    folderGroups.path !== undefined
  ) {
    const branch = folderGroups.branch.replace(/\//g, '%2F')
    const source = `${urlObject.origin}/${folderGroups.owner}/${folderGroups.repo}/${folderGroups.action}/${branch}${folderGroups.path}`
    return {
      kind: 'directory',
      source,
      origin: urlObject.origin,
      repo: folderGroups.owner + '/' + folderGroups.repo,
      action: 'tree',
      branch,
      path: folderGroups.path,
    }
  }

  // https://github.com/apache/parquet-testing/blob/master/variant/README.md
  // https://github.com/apache/parquet-testing/raw/refs/heads/master/variant/README.md
  //
  // Alternatives are tried left to right, so the longest one (raw/refs/heads)
  // must come before raw. Otherwise raw always wins and a /raw/refs/heads/ URL
  // is parsed as branch 'refs' with path '/heads/<branch>/...'.
  const fileGroups =
    /^\/(?<owner>[^/]+)\/(?<repo>[^/]+)\/(?<action>raw\/refs\/heads|raw|blob)\/(?<branch>[^/]+)(?<path>(\/[^/]+)+)$/.exec(
      pathname
    )?.groups
  if (
    fileGroups?.owner !== undefined &&
    fileGroups.repo !== undefined &&
    fileGroups.action !== undefined &&
    fileGroups.branch !== undefined &&
    fileGroups.path !== undefined
  ) {
    const branch = fileGroups.branch.replace(/\//g, '%2F')
    const source = `${urlObject.origin}/${fileGroups.owner}/${fileGroups.repo}/${fileGroups.action}/${branch}${fileGroups.path}`
    return {
      kind: 'file',
      source,
      origin: urlObject.origin,
      repo: fileGroups.owner + '/' + fileGroups.repo,
      action: fileGroups.action === 'blob' ? 'blob' : fileGroups.action === 'raw' ? 'raw' : 'raw/refs/heads',
      branch,
      path: fileGroups.path,
      // the file content is fetched from the raw-content host
      resolveUrl: `${baseRawUrl}/${fileGroups.owner}/${fileGroups.repo}/${branch}${fileGroups.path}`,
    }
  }

  throw new Error('Unsupported GitHub URL')
}
252+
253+
/**
254+
* List branches in a GitHub dataset repo
255+
*
256+
* Example API URL: https://api.github.com/repos/owner/repo/branches
257+
*
258+
* @param repo (namespace/repo)
259+
* @param [options]
260+
* @param [options.requestInit] - request init object to pass to fetch
261+
* @param [options.accessToken] - access token to use for authentication
262+
*
263+
* @returns the list of branch names
264+
*/
265+
async function fetchBranchesList(
266+
url: GHUrl,
267+
options?: {requestInit?: RequestInit, accessToken?: string}
268+
): Promise<string[]> {
269+
const headers = new Headers(options?.requestInit?.headers)
270+
headers.set('accept', 'application/vnd.github+json')
271+
if (options?.accessToken) {
272+
headers.set('Authorization', `Bearer ${options.accessToken}`)
273+
}
274+
const response = await fetch(`https://api.github.com/repos/${url.repo}/branches`, { ...options?.requestInit, headers })
275+
if (!response.ok) {
276+
throw new Error(`HTTP error ${response.statusText} (${response.status})`)
277+
}
278+
const branches = await response.json() as {name: string}[]
279+
return branches.map(({ name }) => name)
280+
}

src/lib/sources/httpSource.ts

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -98,7 +98,6 @@ export function getHttpSource(sourceId: string, options?: {requestInit?: Request
9898
kind: 'directory',
9999
sourceId,
100100
sourceParts,
101-
prefix,
102101
listFiles: () => s3list(bucket, prefix).then(items =>
103102
items
104103
// skip s3 directory placeholder

src/lib/sources/huggingFaceSource.ts

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -72,9 +72,6 @@ function getSourceParts(url: HFUrl): SourcePart[] {
7272
}
7373
return sourceParts
7474
}
75-
function getPrefix(url: DirectoryUrl): string {
76-
return `${url.origin}/${getFullName(url)}/tree/${url.branch}${url.path}`.replace(/\/$/, '')
77-
}
7875
async function fetchFilesList(url: DirectoryUrl, options?: { requestInit?: RequestInit, accessToken?: string }): Promise<FileMetadata[]> {
7976
const repoFullName = getFullName(url)
8077
const filesIterator = listFiles({
@@ -134,7 +131,6 @@ export function getHuggingFaceSource(sourceId: string, options?: {requestInit?:
134131
kind: 'directory',
135132
sourceId,
136133
sourceParts: getSourceParts(url),
137-
prefix: getPrefix(url),
138134
listFiles: () => fetchFilesList(url, options),
139135
fetchVersions,
140136
}

0 commit comments

Comments
 (0)