Skip to content
1 change: 1 addition & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@
"watch:url": "NODE_ENV=development nodemon bin/cli.js https://hyperparam.blob.core.windows.net/hyperparam/starcoderdata-js-00000-of-00065.parquet"
},
"dependencies": {
"@huggingface/hub": "2.6.12",
"hightable": "0.20.2",
"hyparquet": "1.20.0",
"hyparquet-compressors": "1.1.1",
Expand Down
5 changes: 4 additions & 1 deletion src/components/App/App.tsx
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import { useMemo } from 'react'
import { Config, ConfigProvider } from '../../hooks/useConfig.js'
import { getHttpSource } from '../../lib/sources/httpSource.js'
import { getHuggingFaceSource } from '../../lib/sources/huggingFaceSource.js'
import { getHyperparamSource } from '../../lib/sources/hyperparamSource.js'
import Page from '../Page/Page.js'

Expand All @@ -10,7 +11,9 @@ export default function App() {
const row = search.get('row') === null ? undefined : Number(search.get('row'))
const col = search.get('col') === null ? undefined : Number(search.get('col'))

const source = getHttpSource(sourceId) ?? getHyperparamSource(sourceId, { endpoint: location.origin })
const source = getHuggingFaceSource(sourceId) ??
getHttpSource(sourceId) ??
getHyperparamSource(sourceId, { endpoint: location.origin })

// Memoize the config to avoid creating a new object on each render
const config: Config = useMemo(() => ({
Expand Down
9 changes: 8 additions & 1 deletion src/components/Breadcrumb/Breadcrumb.module.css
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
min-height: 32px;
border-bottom: 1px solid #ddd;
background: var(--color-background-dark);
padding: 0 10px 0 20px;
padding: 0 20px;
border-radius: var(--border-radius-lg);
margin: var(--space-3xs);
/* TODO(SL): forbid overflow? */
Expand Down Expand Up @@ -56,6 +56,13 @@

.versions {
padding-left: 4px;
& > button {
color: #eee;

&:hover, &:focus {
color: #fff
}
}

[aria-current] {
font-weight: bold;
Expand Down
288 changes: 288 additions & 0 deletions src/lib/sources/huggingFaceSource.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,288 @@
import { type RepoFullName, type RepoType, listFiles } from '@huggingface/hub'
import type { DirSource, FileMetadata, FileSource, SourcePart } from './types.js'
import { getFileName } from './utils.js'

export const baseUrl = 'https://huggingface.co'

/**
 * Build the repository path as it appears in Hugging Face URLs.
 *
 * Datasets are prefixed with `datasets/`, spaces with `spaces/`, and any
 * other repository type is returned as the bare `namespace/repo` name.
 *
 * @param url - parsed Hugging Face URL
 * @returns the repository full name, including the type prefix if any
 */
function getFullName(url: HFUrl): RepoFullName {
  if (url.type === 'dataset') {
    return `datasets/${url.repo}`
  }
  if (url.type === 'space') {
    return `spaces/${url.repo}`
  }
  return url.repo
}
/**
 * Build the breadcrumb parts for a parsed Hugging Face URL.
 *
 * The first part always points to the root of the repository tree at the
 * given branch. When the URL has a path, one part is added per intermediate
 * directory (linking to its tree view), followed by a final part for the last
 * path segment, which links using the URL's own action.
 *
 * @param url - parsed Hugging Face URL
 * @returns the ordered list of source parts, from the repository root to the leaf
 */
function getSourceParts(url: HFUrl): SourcePart[] {
  const repoPath = getFullName(url)
  const treeRoot = `${baseUrl}/${repoPath}/tree/${url.branch}/`
  const rootPart: SourcePart = {
    sourceId: treeRoot,
    text: `${baseUrl}/${repoPath}/${url.action}/${url.branch}/`,
  }

  // Drop empty segments (leading, trailing or doubled slashes)
  const segments = url.path.split('/').filter(segment => segment.length > 0)
  const leaf = segments.at(-1)
  if (!leaf) {
    // No path: only the repository root is shown
    return [rootPart]
  }

  // Every segment but the last one is a directory, so it links to its tree view
  const ancestorParts = segments.slice(0, -1).map((segment, index): SourcePart => ({
    sourceId: `${treeRoot}${segments.slice(0, index + 1).join('/')}`,
    text: segment + '/',
  }))
  const leafPart: SourcePart = {
    sourceId: `${baseUrl}/${repoPath}/${url.action}/${url.branch}${url.path}`,
    text: leaf,
  }
  return [rootPart, ...ancestorParts, leafPart]
}
/**
 * Build the tree URL of a directory, without a trailing slash.
 *
 * @param url - parsed Hugging Face directory URL
 * @returns the directory URL used as the prefix of the directory source
 */
function getPrefix(url: DirectoryUrl): string {
  const treeUrl = `${url.origin}/${getFullName(url)}/tree/${url.branch}${url.path}`
  // Strip a single trailing slash, if any
  return treeUrl.endsWith('/') ? treeUrl.slice(0, -1) : treeUrl
}
/**
 * List the entries of a Hugging Face directory.
 *
 * Iterates over the results of `listFiles` for the repository, revision and
 * path given by the URL, and converts each entry to a FileMetadata object.
 *
 * @param url - parsed Hugging Face directory URL
 * @param [options]
 * @param [options.requestInit] - not used by this function
 * @param [options.accessToken] - access token passed to `listFiles`
 *
 * @returns the metadata of every entry yielded by `listFiles`
 */
async function fetchFilesList(url: DirectoryUrl, options?: {requestInit?: RequestInit, accessToken?: string}): Promise<FileMetadata[]> {
  const repoUrl = `${url.origin}/${getFullName(url)}`
  // remove leading slash if any
  const directoryPath = 'path' in url ? url.path.replace(/^\//, '') : ''
  const entries = listFiles({
    repo: {
      name: url.repo,
      type: url.type,
    },
    revision: url.branch,
    path: directoryPath,
    expand: true,
    accessToken: options?.accessToken,
  })

  const result: FileMetadata[] = []
  for await (const entry of entries) {
    // 'unknown' is considered as a directory
    const isFile = entry.type === 'file'
    const action = isFile ? 'blob' : 'tree'
    result.push({
      name: getFileName(entry.path),
      eTag: entry.lastCommit?.id,
      size: entry.size,
      lastModified: entry.lastCommit?.date,
      sourceId: `${repoUrl}/${action}/${url.branch}/${entry.path}`.replace(/\/$/, ''),
      kind: isFile ? 'file' : 'directory',
    })
  }
  return result
}
/**
 * Build a file or directory source from a Hugging Face URL.
 *
 * @param sourceId - URL of a Hugging Face repository, directory or file
 * @param [options]
 * @param [options.requestInit] - request init object, set on file sources and passed when fetching the refs
 * @param [options.accessToken] - access token used when listing files and fetching the refs
 *
 * @returns a file source, a directory source, or undefined if sourceId
 * cannot be parsed as a supported Hugging Face URL
 */
export function getHuggingFaceSource(sourceId: string, options?: {requestInit?: RequestInit, accessToken?: string}): FileSource | DirSource | undefined {
  try {
    const url = parseHuggingFaceUrl(sourceId)
    // List the refs of the repository, each pointing to the same path at that ref
    async function fetchVersions() {
      const refsList = await fetchRefsList(url, options)
      return {
        label: 'Branches',
        versions: refsList.map(({ refType, name, ref }) => {
          const label = refType === 'branches' ? name :
            refType === 'converts' ? `[convert] ${name}` :
              refType === 'tags' ? `[tag] ${name}` :
                `[pr] ${name}`
          // Branches and tags use their short name in the URL
          // (e.g. refs/heads/main -> main, refs/tags/v1.0 -> v1.0).
          // Only refs/convert/* and refs/pr/* are accepted by parseHuggingFaceUrl
          // as multi-segment revisions, so keeping the refs/tags/ prefix would
          // produce a sourceId that cannot be parsed back.
          const fixedRef = refType === 'branches' ? ref.replace(/^refs\/heads\//, '') :
            refType === 'tags' ? ref.replace(/^refs\/tags\//, '') :
              ref
          const branchSourceId = `${url.origin}/${getFullName(url)}/${url.kind === 'file' ? 'blob' : 'tree'}/${fixedRef}${url.path}`
          return {
            label,
            sourceId: branchSourceId,
          }
        }),
      }
    }
    if (url.kind === 'file') {
      return {
        kind: 'file',
        sourceId,
        sourceParts: getSourceParts(url),
        fileName: getFileName(url.path),
        resolveUrl: url.resolveUrl,
        requestInit: options?.requestInit,
        fetchVersions,
      }
    } else {
      return {
        kind: 'directory',
        sourceId,
        sourceParts: getSourceParts(url),
        prefix: getPrefix(url),
        listFiles: () => fetchFilesList(url, options),
        fetchVersions,
      }
    }
  } catch {
    // Not a supported Hugging Face URL: let the caller try another source type
    return undefined
  }
}

/**
 * Fields shared by every parsed Hugging Face URL (see parseHuggingFaceUrl).
 */
interface BaseUrl {
// URL of the resource: the input URL for a bare repository URL, otherwise a
// rebuilt URL in which the slashes of the branch are encoded as %2F
source: string
// origin of the input URL
origin: string
// repository type
type: RepoType
// repository name, as namespace/repo
repo: string
// branch (revision), with its slashes encoded as %2F; 'main' for a bare repository URL
branch: string
// path inside the repository: empty, or starting with a slash
path: string
}

/**
 * Parsed URL of a repository or of a directory inside a repository.
 */
export interface DirectoryUrl extends BaseUrl {
kind: 'directory'
action: 'tree'
}

/**
 * Parsed URL of a file inside a repository.
 */
export interface FileUrl extends BaseUrl {
kind: 'file'
action: 'resolve' | 'blob'
// URL of the same file, using the 'resolve' action
resolveUrl: string
}

// Any parsed Hugging Face URL, discriminated by `kind`
type HFUrl = DirectoryUrl | FileUrl;

/**
 * Parse a Hugging Face URL into its components.
 *
 * Supported forms (optionally prefixed by /datasets or /spaces):
 * - /namespace/repo: repository root, on the 'main' branch
 * - /namespace/repo/tree/branch[/path]: directory
 * - /namespace/repo/blob/branch/path and /namespace/repo/resolve/branch/path: file
 *
 * The branch is a single path segment, or refs/convert/name or refs/pr/name.
 * In the result, the slashes of the branch are encoded as %2F.
 *
 * @param url - the URL to parse
 * @returns the parsed directory or file URL
 * @throws TypeError if url is not a valid URL
 * @throws Error if url is not an https Hugging Face URL, or if its form is not supported
 */
export function parseHuggingFaceUrl(url: string): HFUrl {
  const urlObject = new URL(url)
  // ^ throws 'TypeError: URL constructor: {url} is not a valid URL.' if url is not a valid URL

  if (
    urlObject.protocol !== 'https:' ||
    ![
      'huggingface.co', 'huggingface.com', 'hf.co',
      // hf.com is not a HF domain
    ].includes(urlObject.host)
  ) {
    throw new Error('Not a Hugging Face URL')
  }

  let { pathname } = urlObject
  let type: RepoType = 'model'
  // Match the whole first path segment (note the trailing slash): a namespace
  // that merely starts with "datasets" or "spaces" (e.g. /datasets-org/model)
  // is a model repository, not a dataset or a space.
  if (pathname.startsWith('/datasets/')) {
    type = 'dataset'
    pathname = pathname.slice('/datasets'.length)
  } else if (pathname.startsWith('/spaces/')) {
    type = 'space'
    pathname = pathname.slice('/spaces'.length)
  }

  // Repository root: /namespace/repo
  const repoGroups = /^\/(?<namespace>[^/]+)\/(?<repo>[^/]+)\/?$/.exec(
    pathname
  )?.groups
  if (repoGroups?.namespace !== undefined && repoGroups.repo !== undefined) {
    return {
      kind: 'directory',
      source: url,
      origin: urlObject.origin,
      type,
      repo: repoGroups.namespace + '/' + repoGroups.repo,
      action: 'tree',
      branch: 'main', // hardcode the default branch
      path: '',
    }
  }

  // Directory: /namespace/repo/tree/branch[/path]
  const folderGroups =
    /^\/(?<namespace>[^/]+)\/(?<repo>[^/]+)\/(?<action>tree)\/(?<branch>(refs\/(convert|pr)\/)?[^/]+)(?<path>(\/[^/]+)*)\/?$/.exec(
      pathname
    )?.groups
  if (
    folderGroups?.namespace !== undefined &&
    folderGroups.repo !== undefined &&
    folderGroups.action !== undefined &&
    folderGroups.branch !== undefined &&
    folderGroups.path !== undefined &&
    // a bare 'refs' branch means an unsupported refs/* form was given
    folderGroups.branch !== 'refs'
  ) {
    const typePath = type === 'dataset' ? '/datasets' : type === 'space' ? '/spaces' : ''
    const branch = folderGroups.branch.replace(/\//g, '%2F')
    const source = `${urlObject.origin}${typePath}/${folderGroups.namespace}/${folderGroups.repo}/${folderGroups.action}/${branch}${folderGroups.path}`
    return {
      kind: 'directory',
      source,
      origin: urlObject.origin,
      type,
      repo: folderGroups.namespace + '/' + folderGroups.repo,
      action: 'tree',
      branch,
      path: folderGroups.path,
    }
  }

  // File: /namespace/repo/(blob|resolve)/branch/path
  const fileGroups =
    /^\/(?<namespace>[^/]+)\/(?<repo>[^/]+)\/(?<action>blob|resolve)\/(?<branch>(refs\/(convert|pr)\/)?[^/]+)(?<path>(\/[^/]+)+)$/.exec(
      pathname
    )?.groups
  if (
    fileGroups?.namespace !== undefined &&
    fileGroups.repo !== undefined &&
    fileGroups.action !== undefined &&
    fileGroups.branch !== undefined &&
    fileGroups.path !== undefined &&
    // a bare 'refs' branch means an unsupported refs/* form was given
    fileGroups.branch !== 'refs'
  ) {
    const typePath = type === 'dataset' ? '/datasets' : type === 'space' ? '/spaces' : ''
    const branch = fileGroups.branch.replace(/\//g, '%2F')
    const source = `${urlObject.origin}${typePath}/${fileGroups.namespace}/${fileGroups.repo}/${fileGroups.action}/${branch}${fileGroups.path}`
    return {
      kind: 'file',
      source,
      origin: urlObject.origin,
      type,
      repo: fileGroups.namespace + '/' + fileGroups.repo,
      action: fileGroups.action === 'blob' ? 'blob' : 'resolve',
      branch,
      path: fileGroups.path,
      resolveUrl: `${urlObject.origin}${typePath}/${fileGroups.namespace}/${fileGroups.repo}/resolve/${branch}${fileGroups.path}`,
    }
  }

  throw new Error('Unsupported Hugging Face URL')
}

/**
 * One ref entry, as read from the JSON response in fetchRefsList.
 */
interface RefResponse {
// short name of the ref; used to build the version labels
name: string;
// full ref name (e.g. refs/heads/main)
ref: string;
// presumably the commit the ref points to; not read in this file
targetCommit: string;
}

/**
 * The ref categories read from the refs response, in the order in which
 * fetchRefsList returns them.
 */
export const refTypes = [
'branches',
'tags',
'converts',
'pullRequests',
] as const
// Union of the ref category names listed in refTypes
type RefType = (typeof refTypes)[number];
// The refs response: ref entries grouped by category; any category may be missing
type RefsResponse = Partial<Record<RefType, RefResponse[]>>;

/**
 * A ref entry, tagged with the category it was listed under.
 */
export interface RefMetadata extends RefResponse {
refType: RefType; // TODO(SL): use it to style the refs differently?
}

/**
 * List the refs of a Hugging Face repository
 *
 * Example API URL: https://huggingface.co/api/datasets/codeparrot/github-code/refs
 *
 * @param url - parsed Hugging Face URL; its type and repo (namespace/repo) select the repository
 * @param [options]
 * @param [options.requestInit] - request init object to pass to fetch
 * @param [options.accessToken] - access token to use for authentication
 *
 * @returns the list of branches, tags, converts, and pull requests, in that order
 * @throws TypeError if the access token does not start with 'hf_'
 * @throws Error if the HTTP response is not ok
 */
export async function fetchRefsList(
  url: HFUrl,
  options?: {requestInit?: RequestInit, accessToken?: string}
): Promise<RefMetadata[]> {
  const accessToken = options?.accessToken
  if (accessToken && !accessToken.startsWith('hf_')) {
    throw new TypeError('Your access token must start with \'hf_\'')
  }

  // Start from the caller's headers, then force JSON and add authentication
  const headers = new Headers(options?.requestInit?.headers)
  headers.set('accept', 'application/json')
  if (accessToken) {
    headers.set('Authorization', `Bearer ${accessToken}`)
  }

  const endpoint = `https://huggingface.co/api/${url.type}s/${url.repo}/refs`
  const response = await fetch(endpoint, { ...options?.requestInit, headers })
  if (!response.ok) {
    throw new Error(`HTTP error ${response.status.toString()}`)
  }
  const refsByType = await response.json() as RefsResponse

  // Flatten the refs, tagging each one with the category it was listed under
  const refs: RefMetadata[] = []
  for (const refType of refTypes) {
    const entries = refsByType[refType]
    if (!entries) {
      continue
    }
    for (const entry of entries) {
      refs.push({
        refType,
        ...entry,
      })
    }
  }
  return refs
}
1 change: 1 addition & 0 deletions src/lib/sources/index.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
export { getHttpSource } from './httpSource.js'
export { getHyperparamSource } from './hyperparamSource.js'
export { getHuggingFaceSource } from './huggingFaceSource.js'
export type { HyperparamFileMetadata } from './hyperparamSource.js'
export type { DirSource, FileKind, FileMetadata, FileSource, Source, SourcePart } from './types.js'
export { getFileName } from './utils.js'
Loading