@@ -4,9 +4,12 @@ package github
44import (
55 "context"
66 "encoding/json"
7+ "errors"
78 "fmt"
9+ "io"
810 "log/slog"
911 "net/http"
12+ "os"
1013 "regexp"
1114 "strconv"
1215 "strings"
@@ -62,6 +65,7 @@ type Client struct {
6265 httpClient * http.Client
6366 cache cache.HTTPCache
6467 logger * slog.Logger
68+ token string
6569}
6670
6771// Option configures a Client.
@@ -70,6 +74,7 @@ type Option func(*config)
// config holds the options accumulated by Option functions before a
// Client is constructed in New.
type config struct {
	cache  cache.HTTPCache // optional HTTP response cache; nil disables caching
	logger *slog.Logger    // structured logger; nil falls back to slog.Default() in New
	token  string          // GitHub API token; when empty, New falls back to $GITHUB_TOKEN
}
7479
7580// WithHTTPCache sets the HTTP cache.
@@ -82,17 +87,41 @@ func WithLogger(logger *slog.Logger) Option {
8287 return func (c * config ) { c .logger = logger }
8388}
8489
90+ // WithToken sets the GitHub API token.
91+ func WithToken (token string ) Option {
92+ return func (c * config ) { c .token = token }
93+ }
94+
8595// New creates a GitHub client.
8696func New (ctx context.Context , opts ... Option ) (* Client , error ) {
8797 cfg := & config {logger : slog .Default ()}
8898 for _ , opt := range opts {
8999 opt (cfg )
90100 }
91101
102+ // Ensure logger is not nil
103+ logger := cfg .logger
104+ if logger == nil {
105+ logger = slog .Default ()
106+ }
107+
108+ // Try to get token from environment if not provided
109+ token := cfg .token
110+ if token == "" {
111+ token = os .Getenv ("GITHUB_TOKEN" )
112+ }
113+
114+ if token == "" {
115+ logger .WarnContext (ctx , "GITHUB_TOKEN not set - GitHub API requests will be rate-limited to 60/hour" )
116+ } else {
117+ logger .InfoContext (ctx , "using GITHUB_TOKEN for authenticated API requests" )
118+ }
119+
92120 return & Client {
93121 httpClient : & http.Client {Timeout : 3 * time .Second },
94122 cache : cfg .cache ,
95- logger : cfg .logger ,
123+ logger : logger ,
124+ token : token ,
96125 }, nil
97126}
98127
@@ -110,14 +139,43 @@ func (c *Client) Fetch(ctx context.Context, urlStr string) (*profile.Profile, er
110139
111140 c .logger .InfoContext (ctx , "fetching github profile" , "url" , urlStr , "username" , username )
112141
113- // Fetch API data
114- prof , err := c .fetchAPI (ctx , urlStr , username )
115- if err != nil {
116- return nil , err
117- }
142+ // Fetch API data, with fallback to HTML scraping on failure
143+ prof , apiErr := c .fetchAPI (ctx , urlStr , username )
118144
119145 // Fetch HTML to extract rel="me" links, README, and organizations
120146 htmlContent , htmlLinks := c .fetchHTML (ctx , urlStr )
147+
148+ // If API failed, try to build profile from HTML
149+ if apiErr != nil {
150+ var gitHubAPIErr * APIError
151+ if errors .As (apiErr , & gitHubAPIErr ) {
152+ if gitHubAPIErr .IsRateLimit {
153+ c .logger .WarnContext (ctx , "GitHub API rate limited, falling back to HTML scraping" ,
154+ "url" , urlStr ,
155+ "reset_time" , gitHubAPIErr .RateLimitReset .Format (time .RFC3339 ),
156+ )
157+ } else {
158+ c .logger .WarnContext (ctx , "GitHub API access denied, falling back to HTML scraping" ,
159+ "url" , urlStr ,
160+ "status" , gitHubAPIErr .StatusCode ,
161+ )
162+ }
163+ } else {
164+ c .logger .WarnContext (ctx , "GitHub API request failed, falling back to HTML scraping" ,
165+ "url" , urlStr ,
166+ "error" , apiErr ,
167+ )
168+ }
169+
170+ // Try to build profile from HTML
171+ if htmlContent == "" {
172+ return nil , fmt .Errorf ("API failed and no HTML content available: %w" , apiErr )
173+ }
174+
175+ prof = c .parseProfileFromHTML (ctx , htmlContent , urlStr , username )
176+ c .logger .InfoContext (ctx , "built profile from HTML scraping" , "url" , urlStr , "username" , username )
177+ }
178+
121179 prof .SocialLinks = append (prof .SocialLinks , htmlLinks ... )
122180
123181 // Extract README and organizations from HTML if available
@@ -147,6 +205,24 @@ func (c *Client) Fetch(ctx context.Context, urlStr string) (*profile.Profile, er
147205 return prof , nil
148206}
149207
// APIError contains details about a GitHub API error.
//
//nolint:govet // fieldalignment: intentional layout for readability
type APIError struct {
	StatusCode      int
	RateLimitRemain int
	RateLimitReset  time.Time
	Message         string
	IsRateLimit     bool
}

// Error renders the error: rate-limit failures include the reset time,
// all other failures report the HTTP status code.
func (e *APIError) Error() string {
	if !e.IsRateLimit {
		return fmt.Sprintf("GitHub API error %d: %s", e.StatusCode, e.Message)
	}
	return fmt.Sprintf("GitHub API rate limited (resets at %s): %s", e.RateLimitReset.Format(time.RFC3339), e.Message)
}
225+
150226func (c * Client ) fetchAPI (ctx context.Context , urlStr , username string ) (* profile.Profile , error ) {
151227 apiURL := "https://api.github.com/users/" + username
152228
@@ -157,14 +233,86 @@ func (c *Client) fetchAPI(ctx context.Context, urlStr, username string) (*profil
157233 req .Header .Set ("Accept" , "application/vnd.github.v3+json" )
158234 req .Header .Set ("User-Agent" , "sociopath/1.0" )
159235
160- body , err := cache .FetchURL (ctx , c .cache , c .httpClient , req , c .logger )
236+ if c .token != "" {
237+ req .Header .Set ("Authorization" , "Bearer " + c .token )
238+ }
239+
240+ body , err := c .doAPIRequest (ctx , req )
161241 if err != nil {
162242 return nil , err
163243 }
164244
165245 return parseJSON (body , urlStr , username )
166246}
167247
248+ func (c * Client ) doAPIRequest (ctx context.Context , req * http.Request ) ([]byte , error ) {
249+ // Check cache first
250+ cacheKey := req .URL .String ()
251+ if c .cache != nil {
252+ if data , _ , _ , found := c .cache .Get (ctx , cacheKey ); found {
253+ c .cache .RecordHit ()
254+ if s := string (data ); strings .HasPrefix (s , "ERROR:" ) {
255+ code , _ := strconv .Atoi (strings .TrimPrefix (s , "ERROR:" )) //nolint:errcheck // parse error defaults to 0 which is acceptable
256+ c .logger .DebugContext (ctx , "cache hit (error)" , "key" , cacheKey , "status" , code )
257+ return nil , & APIError {StatusCode : code , Message : "cached error" }
258+ }
259+ c .logger .DebugContext (ctx , "cache hit" , "key" , cacheKey )
260+ return data , nil
261+ }
262+ c .cache .RecordMiss ()
263+ c .logger .InfoContext (ctx , "cache miss" , "url" , req .URL .String ())
264+ } else {
265+ c .logger .InfoContext (ctx , "cache disabled" , "url" , req .URL .String ())
266+ }
267+
268+ resp , err := c .httpClient .Do (req )
269+ if err != nil {
270+ return nil , err
271+ }
272+ defer func () { _ = resp .Body .Close () }() //nolint:errcheck // error ignored intentionally
273+
274+ // Parse rate limit headers (GitHub uses non-canonical casing, parse errors default to 0)
275+ rateLimitRemain , _ := strconv .Atoi (resp .Header .Get ("X-RateLimit-Remaining" )) //nolint:errcheck,canonicalheader // ok
276+ rateLimitReset , _ := strconv .ParseInt (resp .Header .Get ("X-RateLimit-Reset" ), 10 , 64 ) //nolint:errcheck,canonicalheader // ok
277+ resetTime := time .Unix (rateLimitReset , 0 )
278+
279+ if resp .StatusCode != http .StatusOK {
280+ body , _ := io .ReadAll (resp .Body ) //nolint:errcheck // best effort read of error body
281+ isRateLimit := resp .StatusCode == http .StatusForbidden && rateLimitRemain == 0
282+
283+ apiErr := & APIError {
284+ StatusCode : resp .StatusCode ,
285+ RateLimitRemain : rateLimitRemain ,
286+ RateLimitReset : resetTime ,
287+ Message : string (body ),
288+ IsRateLimit : isRateLimit ,
289+ }
290+
291+ c .logger .WarnContext (ctx , "GitHub API request failed" ,
292+ "url" , req .URL .String (),
293+ "status" , resp .StatusCode ,
294+ "rate_limit_remaining" , rateLimitRemain ,
295+ "rate_limit_reset" , resetTime .Format (time .RFC3339 ),
296+ "is_rate_limit" , isRateLimit ,
297+ "response_body" , string (body ),
298+ )
299+
300+ return nil , apiErr
301+ }
302+
303+ body , err := io .ReadAll (resp .Body )
304+ if err != nil {
305+ return nil , err
306+ }
307+
308+ // Cache successful response
309+ if c .cache != nil {
310+ _ = c .cache .SetAsync (ctx , cacheKey , body , "" , nil ) //nolint:errcheck // async write errors are non-fatal
311+ }
312+
313+ return body , nil
314+ }
315+
168316func (c * Client ) fetchHTML (ctx context.Context , urlStr string ) (content string , links []string ) {
169317 req , err := http .NewRequestWithContext (ctx , http .MethodGet , urlStr , http .NoBody )
170318 if err != nil {
@@ -434,3 +582,59 @@ func dedupeLinks(links []string) []string {
434582 }
435583 return result
436584}
585+
586+ // parseProfileFromHTML extracts profile data from GitHub HTML when API is unavailable.
587+ func (c * Client ) parseProfileFromHTML (ctx context.Context , html , urlStr , username string ) * profile.Profile {
588+ prof := & profile.Profile {
589+ Platform : platform ,
590+ URL : urlStr ,
591+ Authenticated : false ,
592+ Username : username ,
593+ Fields : make (map [string ]string ),
594+ }
595+
596+ // Extract full name: <span class="p-name vcard-fullname..." itemprop="name">
597+ namePattern := regexp .MustCompile (`<span[^>]+class="[^"]*p-name[^"]*"[^>]*itemprop="name"[^>]*>\s*([^<]+)` )
598+ if matches := namePattern .FindStringSubmatch (html ); len (matches ) > 1 {
599+ prof .Name = strings .TrimSpace (matches [1 ])
600+ }
601+
602+ // Extract bio: <div class="p-note user-profile-bio..." data-bio-text="...">
603+ bioPattern := regexp .MustCompile (`data-bio-text="([^"]+)"` )
604+ if matches := bioPattern .FindStringSubmatch (html ); len (matches ) > 1 {
605+ prof .Bio = strings .TrimSpace (matches [1 ])
606+ }
607+
608+ // Extract location: <li... itemprop="homeLocation"... aria-label="Home location: ...">
609+ locPattern := regexp .MustCompile (`itemprop="homeLocation"[^>]*aria-label="Home location:\s*([^"]+)"` )
610+ if matches := locPattern .FindStringSubmatch (html ); len (matches ) > 1 {
611+ prof .Location = strings .TrimSpace (matches [1 ])
612+ }
613+
614+ // Extract website: <li itemprop="url" data-test-selector="profile-website-url"...>...<a...href="...">
615+ websitePattern := regexp .MustCompile (`(?s)itemprop="url"[^>]*data-test-selector="profile-website-url"[^>]*>.*?href="([^"]+)"` )
616+ if matches := websitePattern .FindStringSubmatch (html ); len (matches ) > 1 {
617+ website := matches [1 ]
618+ if ! strings .HasPrefix (website , "http" ) {
619+ website = "https://" + website
620+ }
621+ prof .Website = website
622+ prof .Fields ["website" ] = website
623+ }
624+
625+ // Extract avatar URL
626+ avatarPattern := regexp .MustCompile (`<img[^>]+class="[^"]*avatar avatar-user[^"]*"[^>]+src="([^"]+)"` )
627+ if matches := avatarPattern .FindStringSubmatch (html ); len (matches ) > 1 {
628+ prof .Fields ["avatar_url" ] = matches [1 ]
629+ }
630+
631+ c .logger .DebugContext (ctx , "parsed profile from HTML" ,
632+ "username" , username ,
633+ "name" , prof .Name ,
634+ "bio" , prof .Bio ,
635+ "location" , prof .Location ,
636+ "website" , prof .Website ,
637+ )
638+
639+ return prof
640+ }
0 commit comments