@@ -13,27 +13,31 @@ public class PdfMapper : IPDFMapper
1313 private readonly IMapperSerializer _serializer ;
1414 private readonly IMapperClientFactory _mapperClientFactory ;
1515 private readonly MapperClientConfiguration _clientConfiguration ;
16+ private readonly HttpClient ? _httpClient ;
1617
/// <summary>
/// Creates a mapper that uses the given serializer, mapper-client factory and client configuration.
/// </summary>
/// <param name="serializer">Serializer used for the extracted PDF text, the target object and the client response.</param>
/// <param name="mapperClientFactory">Factory used to create the mapper client for each mapping call.</param>
/// <param name="clientConfiguration">Configuration handed to <paramref name="mapperClientFactory"/> when creating a client.</param>
/// <param name="httpClient">Optional HTTP client; only needed when a PDF is referenced by an http(s) link.</param>
/// <exception cref="ArgumentNullException">Thrown when a required dependency is <c>null</c>.</exception>
public PdfMapper(IMapperSerializer serializer, IMapperClientFactory mapperClientFactory, MapperClientConfiguration clientConfiguration, HttpClient? httpClient = null)
{
    // Fail fast on missing required dependencies instead of surfacing a NullReferenceException on first use.
    _serializer = serializer ?? throw new ArgumentNullException(nameof(serializer));
    _mapperClientFactory = mapperClientFactory ?? throw new ArgumentNullException(nameof(mapperClientFactory));
    _clientConfiguration = clientConfiguration ?? throw new ArgumentNullException(nameof(clientConfiguration));

    // The HTTP client is intentionally optional: local-file mapping never touches it.
    _httpClient = httpClient;
}
2325
24-
/// <summary>
/// Extracts the text of a PDF (local path or http/https link), builds a prompt from that text and the
/// serialized shape of a fresh <typeparamref name="T"/>, sends it through the mapper client and
/// deserializes the client's response value into <typeparamref name="T"/>.
/// </summary>
/// <typeparam name="T">Target type; must be a class with a public parameterless constructor.</typeparam>
/// <param name="pdfPath">File path of the PDF, or a link starting with <c>http://</c> or <c>https://</c>.</param>
/// <param name="cancellationToken">Token observed before extraction starts and passed to the mapper client request.</param>
/// <returns>The deserialized object, or <c>null</c> if the serializer yields none.</returns>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="pdfPath"/> is <c>null</c>.</exception>
/// <exception cref="ArgumentException">Thrown when <paramref name="pdfPath"/> is a web link but no HttpClient was supplied.</exception>
public async Task<T?> MapAsync<T>(string pdfPath, CancellationToken cancellationToken = default) where T : class, new()
{
    ArgumentNullException.ThrowIfNull(pdfPath);

    var isWeb = IsWebLink(pdfPath);
    if (isWeb && _httpClient is null)
    {
        throw new ArgumentException("HttpClient instance is required");
    }

    // Honour an already-cancelled token before doing any PDF download or parsing work.
    cancellationToken.ThrowIfCancellationRequested();

    var mapperClient = _mapperClientFactory.CreateClient(_clientConfiguration);
    var pdfContent = isWeb
        ? await ExtractPdfWebContent(pdfPath)
        : SerializePdfContent(new PdfReader(pdfPath));

    var destinyObject = new T();
    // Initialize() is declared outside this view; presumably it populates the instance so that its
    // serialized form exposes the full structure to the prompt -- confirm against its definition.
    destinyObject.Initialize();

    var prompt = CreatePrompt(pdfContent, _serializer.Serialize(destinyObject));
    var result = await mapperClient.SendAsync(prompt, cancellationToken);
    return _serializer.Deserialize<T>(result.Value);
}
38+
3539
36- private string CreatePrompt ( string pdfContent , string classStructure )
40+ private static string CreatePrompt ( string pdfContent , string classStructure )
3741 {
3842 return $ """
3943 You are a senior software engineer specializing in data extraction and mapping.
@@ -54,32 +58,59 @@ Do not include explanations or markdown formatting.
5458 }
5559
5660
57- private string ExtractPdfContent ( string pdfPath )
/// <summary>
/// Downloads the PDF at <paramref name="pdfUri"/> with the injected HttpClient and returns its
/// serialized per-page text.
/// </summary>
/// <param name="pdfUri">
/// Link to the PDF. Links starting with <c>https://drive.google.com</c> that do not already contain
/// <c>uc?export=download</c> are rewritten via <c>ParseDriveUrl</c> before the request is made.
/// </param>
/// <param name="cancellationToken">Token passed to the HTTP request and to reading the response stream.</param>
/// <returns>The serialized text extracted from the downloaded PDF.</returns>
/// <exception cref="InvalidOperationException">Thrown when no HttpClient was supplied to the mapper.</exception>
/// <exception cref="HttpRequestException">Thrown when the response status code does not indicate success.</exception>
private async Task<string> ExtractPdfWebContent(string pdfUri, CancellationToken cancellationToken = default)
{
    // Narrow the nullable field once instead of silencing the compiler with '!'.
    var httpClient = _httpClient ?? throw new InvalidOperationException("HttpClient instance is required");

    // Ordinal comparison: this is a URL prefix check, not linguistic text.
    if (pdfUri.StartsWith("https://drive.google.com", StringComparison.Ordinal) && !pdfUri.Contains("uc?export=download"))
    {
        pdfUri = ParseDriveUrl(pdfUri);
    }

    // Dispose the response (and with it the content stream) once the text has been extracted.
    using var response = await httpClient.GetAsync(pdfUri, cancellationToken);
    response.EnsureSuccessStatusCode();

    var stream = await response.Content.ReadAsStreamAsync(cancellationToken);
    return SerializePdfContent(new PdfReader(stream));
}
73+
/// <summary>
/// Opens a PDF document over <paramref name="reader"/>, extracts and cleans the text of every page,
/// and returns the list of page texts serialized with the configured serializer.
/// </summary>
/// <param name="reader">Reader positioned on the PDF to extract; consumed by this call.</param>
/// <returns>The serialized list containing one cleaned text entry per page, in page order.</returns>
private string SerializePdfContent(PdfReader reader)
{
    // Close the document when done so the underlying source is released.
    // NOTE(review): iText closes the attached reader together with the document by default -- confirm for the version in use.
    using var pdfDoc = new PdfDocument(reader);

    var pageCount = pdfDoc.GetNumberOfPages();
    var extractedData = new List<string>(pageCount);

    // Page numbers are 1-based.
    for (var pageNumber = 1; pageNumber <= pageCount; pageNumber++)
    {
        var pageText = PdfTextExtractor.GetTextFromPage(pdfDoc.GetPage(pageNumber));
        extractedData.Add(CleanText(pageText));
    }

    return _serializer.Serialize(extractedData);
}
89+
/// <summary>
/// Converts a Google Drive link of the form <c>.../d/{fileId}/...</c> into the direct-download
/// form <c>https://drive.google.com/uc?export=download&amp;id={fileId}</c>.
/// </summary>
/// <param name="pdfUri">Absolute Drive link containing a <c>d/</c> path segment followed by the file id.</param>
/// <returns>The direct-download link for the file id found in <paramref name="pdfUri"/>.</returns>
/// <exception cref="ArgumentException">Thrown when no non-empty file id follows a <c>d/</c> segment.</exception>
private static string ParseDriveUrl(string pdfUri)
{
    string[] pathParts = new Uri(pdfUri).Segments;

    // The file id is the path segment immediately after the first "d/" segment.
    int markerIndex = Array.IndexOf(pathParts, "d/");
    int idIndex = markerIndex + 1;

    string? fileId = markerIndex >= 0 && idIndex < pathParts.Length
        ? pathParts[idIndex].TrimEnd('/')
        : null;

    if (string.IsNullOrEmpty(fileId))
    {
        throw new ArgumentException("Invalid drive link");
    }

    return $"https://drive.google.com/uc?export=download&id={fileId}";
}
75-
/// <summary>
/// Flattens extracted page text onto a single line: line breaks become single spaces, literal
/// backslash-n / backslash-r sequences become spaces, and surrounding whitespace is trimmed.
/// </summary>
/// <param name="input">Raw text of one page.</param>
/// <returns>The flattened, trimmed text.</returns>
private static string CleanText(string input)
{
    char[] lineBreaks = ['\n', '\r'];

    // Dropping empty entries collapses "\r\n" pairs and blank lines into one separator.
    string[] lines = input.Split(lineBreaks, StringSplitOptions.RemoveEmptyEntries);
    string singleLine = string.Join(" ", lines);

    // Also neutralise escaped newline markers that appear as literal two-character sequences.
    string withoutEscapes = singleLine.Replace("\\n", " ").Replace("\\r", " ");

    return withoutEscapes.Trim();
}
84-
114+
/// <summary>
/// Returns <c>true</c> when <paramref name="pdfPath"/> starts with an <c>http://</c> or <c>https://</c> scheme.
/// The check is ordinal and case-insensitive because URI schemes are case-insensitive.
/// </summary>
private static bool IsWebLink(string pdfPath) =>
    pdfPath.StartsWith("https://", StringComparison.OrdinalIgnoreCase)
    || pdfPath.StartsWith("http://", StringComparison.OrdinalIgnoreCase);
85116}
0 commit comments