Skip to content

Commit dfdfb57

Browse files
committed
GitHub API fallback
1 parent 833f604 commit dfdfb57

File tree

1 file changed

+211
-7
lines changed

1 file changed

+211
-7
lines changed

pkg/github/github.go

Lines changed: 211 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,12 @@ package github
44
import (
55
"context"
66
"encoding/json"
7+
"errors"
78
"fmt"
9+
"io"
810
"log/slog"
911
"net/http"
12+
"os"
1013
"regexp"
1114
"strconv"
1215
"strings"
@@ -62,6 +65,7 @@ type Client struct {
6265
httpClient *http.Client
6366
cache cache.HTTPCache
6467
logger *slog.Logger
68+
token string
6569
}
6670

6771
// Option configures a Client.
@@ -70,6 +74,7 @@ type Option func(*config)
7074
type config struct {
7175
cache cache.HTTPCache
7276
logger *slog.Logger
77+
token string
7378
}
7479

7580
// WithHTTPCache sets the HTTP cache.
@@ -82,17 +87,41 @@ func WithLogger(logger *slog.Logger) Option {
8287
return func(c *config) { c.logger = logger }
8388
}
8489

90+
// WithToken sets the GitHub API token.
91+
func WithToken(token string) Option {
92+
return func(c *config) { c.token = token }
93+
}
94+
8595
// New creates a GitHub client.
8696
func New(ctx context.Context, opts ...Option) (*Client, error) {
8797
cfg := &config{logger: slog.Default()}
8898
for _, opt := range opts {
8999
opt(cfg)
90100
}
91101

102+
// Ensure logger is not nil
103+
logger := cfg.logger
104+
if logger == nil {
105+
logger = slog.Default()
106+
}
107+
108+
// Try to get token from environment if not provided
109+
token := cfg.token
110+
if token == "" {
111+
token = os.Getenv("GITHUB_TOKEN")
112+
}
113+
114+
if token == "" {
115+
logger.WarnContext(ctx, "GITHUB_TOKEN not set - GitHub API requests will be rate-limited to 60/hour")
116+
} else {
117+
logger.InfoContext(ctx, "using GITHUB_TOKEN for authenticated API requests")
118+
}
119+
92120
return &Client{
93121
httpClient: &http.Client{Timeout: 3 * time.Second},
94122
cache: cfg.cache,
95-
logger: cfg.logger,
123+
logger: logger,
124+
token: token,
96125
}, nil
97126
}
98127

@@ -110,14 +139,43 @@ func (c *Client) Fetch(ctx context.Context, urlStr string) (*profile.Profile, er
110139

111140
c.logger.InfoContext(ctx, "fetching github profile", "url", urlStr, "username", username)
112141

113-
// Fetch API data
114-
prof, err := c.fetchAPI(ctx, urlStr, username)
115-
if err != nil {
116-
return nil, err
117-
}
142+
// Fetch API data, with fallback to HTML scraping on failure
143+
prof, apiErr := c.fetchAPI(ctx, urlStr, username)
118144

119145
// Fetch HTML to extract rel="me" links, README, and organizations
120146
htmlContent, htmlLinks := c.fetchHTML(ctx, urlStr)
147+
148+
// If API failed, try to build profile from HTML
149+
if apiErr != nil {
150+
var gitHubAPIErr *APIError
151+
if errors.As(apiErr, &gitHubAPIErr) {
152+
if gitHubAPIErr.IsRateLimit {
153+
c.logger.WarnContext(ctx, "GitHub API rate limited, falling back to HTML scraping",
154+
"url", urlStr,
155+
"reset_time", gitHubAPIErr.RateLimitReset.Format(time.RFC3339),
156+
)
157+
} else {
158+
c.logger.WarnContext(ctx, "GitHub API access denied, falling back to HTML scraping",
159+
"url", urlStr,
160+
"status", gitHubAPIErr.StatusCode,
161+
)
162+
}
163+
} else {
164+
c.logger.WarnContext(ctx, "GitHub API request failed, falling back to HTML scraping",
165+
"url", urlStr,
166+
"error", apiErr,
167+
)
168+
}
169+
170+
// Try to build profile from HTML
171+
if htmlContent == "" {
172+
return nil, fmt.Errorf("API failed and no HTML content available: %w", apiErr)
173+
}
174+
175+
prof = c.parseProfileFromHTML(ctx, htmlContent, urlStr, username)
176+
c.logger.InfoContext(ctx, "built profile from HTML scraping", "url", urlStr, "username", username)
177+
}
178+
121179
prof.SocialLinks = append(prof.SocialLinks, htmlLinks...)
122180

123181
// Extract README and organizations from HTML if available
@@ -147,6 +205,24 @@ func (c *Client) Fetch(ctx context.Context, urlStr string) (*profile.Profile, er
147205
return prof, nil
148206
}
149207

208+
// APIError describes a failed GitHub API request.
//
//nolint:govet // fieldalignment: intentional layout for readability
type APIError struct {
	// StatusCode is the HTTP status code associated with the failure.
	StatusCode int
	// RateLimitRemain is the value parsed from the X-RateLimit-Remaining header.
	RateLimitRemain int
	// RateLimitReset is the time parsed from the X-RateLimit-Reset header.
	RateLimitReset time.Time
	// Message is the error detail, typically the response body.
	Message string
	// IsRateLimit reports whether the failure was classified as a rate-limit rejection.
	IsRateLimit bool
}

// Error implements the error interface. Rate-limit failures report the reset
// time in RFC 3339 form; every other failure reports the HTTP status code.
func (e *APIError) Error() string {
	if !e.IsRateLimit {
		return fmt.Sprintf("GitHub API error %d: %s", e.StatusCode, e.Message)
	}

	resetAt := e.RateLimitReset.Format(time.RFC3339)
	return fmt.Sprintf("GitHub API rate limited (resets at %s): %s", resetAt, e.Message)
}
225+
150226
func (c *Client) fetchAPI(ctx context.Context, urlStr, username string) (*profile.Profile, error) {
151227
apiURL := "https://api.github.com/users/" + username
152228

@@ -157,14 +233,86 @@ func (c *Client) fetchAPI(ctx context.Context, urlStr, username string) (*profil
157233
req.Header.Set("Accept", "application/vnd.github.v3+json")
158234
req.Header.Set("User-Agent", "sociopath/1.0")
159235

160-
body, err := cache.FetchURL(ctx, c.cache, c.httpClient, req, c.logger)
236+
if c.token != "" {
237+
req.Header.Set("Authorization", "Bearer "+c.token)
238+
}
239+
240+
body, err := c.doAPIRequest(ctx, req)
161241
if err != nil {
162242
return nil, err
163243
}
164244

165245
return parseJSON(body, urlStr, username)
166246
}
167247

248+
func (c *Client) doAPIRequest(ctx context.Context, req *http.Request) ([]byte, error) {
249+
// Check cache first
250+
cacheKey := req.URL.String()
251+
if c.cache != nil {
252+
if data, _, _, found := c.cache.Get(ctx, cacheKey); found {
253+
c.cache.RecordHit()
254+
if s := string(data); strings.HasPrefix(s, "ERROR:") {
255+
code, _ := strconv.Atoi(strings.TrimPrefix(s, "ERROR:")) //nolint:errcheck // parse error defaults to 0 which is acceptable
256+
c.logger.DebugContext(ctx, "cache hit (error)", "key", cacheKey, "status", code)
257+
return nil, &APIError{StatusCode: code, Message: "cached error"}
258+
}
259+
c.logger.DebugContext(ctx, "cache hit", "key", cacheKey)
260+
return data, nil
261+
}
262+
c.cache.RecordMiss()
263+
c.logger.InfoContext(ctx, "cache miss", "url", req.URL.String())
264+
} else {
265+
c.logger.InfoContext(ctx, "cache disabled", "url", req.URL.String())
266+
}
267+
268+
resp, err := c.httpClient.Do(req)
269+
if err != nil {
270+
return nil, err
271+
}
272+
defer func() { _ = resp.Body.Close() }() //nolint:errcheck // error ignored intentionally
273+
274+
// Parse rate limit headers (GitHub uses non-canonical casing, parse errors default to 0)
275+
rateLimitRemain, _ := strconv.Atoi(resp.Header.Get("X-RateLimit-Remaining")) //nolint:errcheck,canonicalheader // ok
276+
rateLimitReset, _ := strconv.ParseInt(resp.Header.Get("X-RateLimit-Reset"), 10, 64) //nolint:errcheck,canonicalheader // ok
277+
resetTime := time.Unix(rateLimitReset, 0)
278+
279+
if resp.StatusCode != http.StatusOK {
280+
body, _ := io.ReadAll(resp.Body) //nolint:errcheck // best effort read of error body
281+
isRateLimit := resp.StatusCode == http.StatusForbidden && rateLimitRemain == 0
282+
283+
apiErr := &APIError{
284+
StatusCode: resp.StatusCode,
285+
RateLimitRemain: rateLimitRemain,
286+
RateLimitReset: resetTime,
287+
Message: string(body),
288+
IsRateLimit: isRateLimit,
289+
}
290+
291+
c.logger.WarnContext(ctx, "GitHub API request failed",
292+
"url", req.URL.String(),
293+
"status", resp.StatusCode,
294+
"rate_limit_remaining", rateLimitRemain,
295+
"rate_limit_reset", resetTime.Format(time.RFC3339),
296+
"is_rate_limit", isRateLimit,
297+
"response_body", string(body),
298+
)
299+
300+
return nil, apiErr
301+
}
302+
303+
body, err := io.ReadAll(resp.Body)
304+
if err != nil {
305+
return nil, err
306+
}
307+
308+
// Cache successful response
309+
if c.cache != nil {
310+
_ = c.cache.SetAsync(ctx, cacheKey, body, "", nil) //nolint:errcheck // async write errors are non-fatal
311+
}
312+
313+
return body, nil
314+
}
315+
168316
func (c *Client) fetchHTML(ctx context.Context, urlStr string) (content string, links []string) {
169317
req, err := http.NewRequestWithContext(ctx, http.MethodGet, urlStr, http.NoBody)
170318
if err != nil {
@@ -434,3 +582,59 @@ func dedupeLinks(links []string) []string {
434582
}
435583
return result
436584
}
585+
586+
// parseProfileFromHTML extracts profile data from GitHub HTML when API is unavailable.
587+
func (c *Client) parseProfileFromHTML(ctx context.Context, html, urlStr, username string) *profile.Profile {
588+
prof := &profile.Profile{
589+
Platform: platform,
590+
URL: urlStr,
591+
Authenticated: false,
592+
Username: username,
593+
Fields: make(map[string]string),
594+
}
595+
596+
// Extract full name: <span class="p-name vcard-fullname..." itemprop="name">
597+
namePattern := regexp.MustCompile(`<span[^>]+class="[^"]*p-name[^"]*"[^>]*itemprop="name"[^>]*>\s*([^<]+)`)
598+
if matches := namePattern.FindStringSubmatch(html); len(matches) > 1 {
599+
prof.Name = strings.TrimSpace(matches[1])
600+
}
601+
602+
// Extract bio: <div class="p-note user-profile-bio..." data-bio-text="...">
603+
bioPattern := regexp.MustCompile(`data-bio-text="([^"]+)"`)
604+
if matches := bioPattern.FindStringSubmatch(html); len(matches) > 1 {
605+
prof.Bio = strings.TrimSpace(matches[1])
606+
}
607+
608+
// Extract location: <li... itemprop="homeLocation"... aria-label="Home location: ...">
609+
locPattern := regexp.MustCompile(`itemprop="homeLocation"[^>]*aria-label="Home location:\s*([^"]+)"`)
610+
if matches := locPattern.FindStringSubmatch(html); len(matches) > 1 {
611+
prof.Location = strings.TrimSpace(matches[1])
612+
}
613+
614+
// Extract website: <li itemprop="url" data-test-selector="profile-website-url"...>...<a...href="...">
615+
websitePattern := regexp.MustCompile(`(?s)itemprop="url"[^>]*data-test-selector="profile-website-url"[^>]*>.*?href="([^"]+)"`)
616+
if matches := websitePattern.FindStringSubmatch(html); len(matches) > 1 {
617+
website := matches[1]
618+
if !strings.HasPrefix(website, "http") {
619+
website = "https://" + website
620+
}
621+
prof.Website = website
622+
prof.Fields["website"] = website
623+
}
624+
625+
// Extract avatar URL
626+
avatarPattern := regexp.MustCompile(`<img[^>]+class="[^"]*avatar avatar-user[^"]*"[^>]+src="([^"]+)"`)
627+
if matches := avatarPattern.FindStringSubmatch(html); len(matches) > 1 {
628+
prof.Fields["avatar_url"] = matches[1]
629+
}
630+
631+
c.logger.DebugContext(ctx, "parsed profile from HTML",
632+
"username", username,
633+
"name", prof.Name,
634+
"bio", prof.Bio,
635+
"location", prof.Location,
636+
"website", prof.Website,
637+
)
638+
639+
return prof
640+
}

0 commit comments

Comments
 (0)