Skip to content

Commit 80ce78a

Browse files
committed
feat: compatibility with web pdf and local file pdf
1 parent 80ea33c commit 80ce78a

4 files changed

Lines changed: 107 additions & 39 deletions

File tree

MapperAI.sln.DotSettings.user

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,19 @@
11
<wpf:ResourceDictionary xml:space="preserve" xmlns:x="http://schemas.microsoft.com/winfx/2006/xaml" xmlns:s="clr-namespace:System;assembly=mscorlib" xmlns:ss="urn:shemas-jetbrains-com:settings-storage-xaml" xmlns:wpf="http://schemas.microsoft.com/winfx/2006/xaml/presentation">
22
<s:String x:Key="/Default/CodeInspection/ExcludedFiles/FilesAndFoldersToSkip2/=7020124F_002D9FFC_002D4AC3_002D8F3D_002DAAB8E0240759_002Ff_003AHttpClient_002Ecs_002Fl_003A_002E_002E_003F_002E_002E_003F_002E_002E_003F_002E_002E_003FAppData_003FRoaming_003FJetBrains_003FRider2024_002E3_003Fresharper_002Dhost_003FSourcesCache_003Fc439425da351c75ac7d966a1cc8324b51a9c471865af79d2f2f3fcb65e392_003FHttpClient_002Ecs/@EntryIndexedValue">ForceIncluded</s:String>
3-
<s:String x:Key="/Default/Environment/UnitTesting/UnitTestSessionStore/Sessions/=5ca51f47_002De3fa_002D40db_002D8e16_002D940aee47c197/@EntryIndexedValue">&lt;SessionState ContinuousTestingMode="0" IsActive="True" Name="Test_Should_Create_4_Files_With_CSharp_Extension" xmlns="urn:schemas-jetbrains-com:jetbrains-ut-session"&gt;&#xD;
3+
<s:String x:Key="/Default/CodeInspection/ExcludedFiles/FilesAndFoldersToSkip2/=7020124F_002D9FFC_002D4AC3_002D8F3D_002DAAB8E0240759_002Ff_003APdfDocument_002Ecs_002Fl_003AC_0021_003FUsers_003FDri_003FAppData_003FRoaming_003FJetBrains_003FRider2025_002E1_003Fresharper_002Dhost_003FDecompilerCache_003Fdecompiler_003Fcbf868f2490746fbab2087b4b2ac8198ff600_003F67_003Ff809c9c7_003FPdfDocument_002Ecs/@EntryIndexedValue">ForceIncluded</s:String>
4+
<s:String x:Key="/Default/Environment/UnitTesting/UnitTestSessionStore/Sessions/=5ca51f47_002De3fa_002D40db_002D8e16_002D940aee47c197/@EntryIndexedValue">&lt;SessionState ContinuousTestingMode="0" Name="Test_Should_Create_4_Files_With_CSharp_Extension" xmlns="urn:schemas-jetbrains-com:jetbrains-ut-session"&gt;&#xD;
45
&lt;TestAncestor&gt;&#xD;
56
&lt;TestId&gt;xUnit::8B3E109D-96CA-4B6D-B379-6AF70646DC25::net8.0::MapperAI.Test.FileMapperTests.Test_Should_Create_4_Files_With_CSharp_Extension&lt;/TestId&gt;&#xD;
67
&lt;TestId&gt;xUnit::8B3E109D-96CA-4B6D-B379-6AF70646DC25::net8.0::MapperAI.Test.ClassMapperTests.Test1&lt;/TestId&gt;&#xD;
78
&lt;TestId&gt;xUnit::8B3E109D-96CA-4B6D-B379-6AF70646DC25::net8.0::MapperAI.Test.PdfMapperTests.Test1&lt;/TestId&gt;&#xD;
89
&lt;TestId&gt;xUnit::8B3E109D-96CA-4B6D-B379-6AF70646DC25::net8.0::MapperAI.Test.FileMapperTests.Test_Should_Create_4_Files_With_Go_Extension&lt;/TestId&gt;&#xD;
10+
&lt;TestId&gt;xUnit::8B3E109D-96CA-4B6D-B379-6AF70646DC25::net8.0::MapperAI.Test.PdfMapperTests.TestWeb&lt;/TestId&gt;&#xD;
11+
&lt;/TestAncestor&gt;&#xD;
12+
&lt;/SessionState&gt;</s:String>
13+
<s:String x:Key="/Default/Environment/UnitTesting/UnitTestSessionStore/Sessions/=7b48511d_002D7b52_002D4ba3_002D89c2_002D2edf68a1cfe5/@EntryIndexedValue">&lt;SessionState ContinuousTestingMode="0" IsActive="True" Name="TestWeb2" xmlns="urn:schemas-jetbrains-com:jetbrains-ut-session"&gt;&#xD;
14+
&lt;TestAncestor&gt;&#xD;
15+
&lt;TestId&gt;xUnit::8B3E109D-96CA-4B6D-B379-6AF70646DC25::net8.0::MapperAI.Test.PdfMapperIntegrationTests&lt;/TestId&gt;&#xD;
16+
&lt;TestId&gt;xUnit::8B3E109D-96CA-4B6D-B379-6AF70646DC25::net8.0::MapperAI.Test.DI.DependencyInjectionTests.AddMapperAI_ShouldRegisterServices&lt;/TestId&gt;&#xD;
917
&lt;/TestAncestor&gt;&#xD;
1018
&lt;/SessionState&gt;</s:String>
1119
<s:String x:Key="/Default/Environment/UnitTesting/UnitTestSessionStore/Sessions/=ee3fe810_002Dfb62_002D402b_002Db9c5_002D8dab803c815e/@EntryIndexedValue">&lt;SessionState ContinuousTestingMode="0" Name="Test1" xmlns="urn:schemas-jetbrains-com:jetbrains-ut-session"&gt;&#xD;
@@ -18,5 +26,7 @@
1826
&lt;TestId&gt;xUnit::8B3E109D-96CA-4B6D-B379-6AF70646DC25::net8.0::MapperAI.Test.FileMapperTests.Test_Should_Create_4_Files_With_CSharp_Extension&lt;/TestId&gt;&#xD;
1927
&lt;TestId&gt;xUnit::8B3E109D-96CA-4B6D-B379-6AF70646DC25::net8.0::MapperAI.Test.DI.DependencyInjectionTests.AddMapperAI_ShouldRegisterServices&lt;/TestId&gt;&#xD;
2028
&lt;TestId&gt;xUnit::8B3E109D-96CA-4B6D-B379-6AF70646DC25::net8.0::MapperAI.Test.FileMapperTests&lt;/TestId&gt;&#xD;
29+
&lt;TestId&gt;xUnit::8B3E109D-96CA-4B6D-B379-6AF70646DC25::net8.0::MapperAI.Test.PdfMapperTests.TestWeb&lt;/TestId&gt;&#xD;
30+
&lt;TestId&gt;xUnit::8B3E109D-96CA-4B6D-B379-6AF70646DC25::net8.0::MapperAI.Test.PdfMapperTests.TestWeb2&lt;/TestId&gt;&#xD;
2131
&lt;/TestAncestor&gt;&#xD;
2232
&lt;/SessionState&gt;</s:String></wpf:ResourceDictionary>

src/MapperAI.Core/Mappers/Interfaces/IPDFMapper.cs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,8 @@
22

33
/// <summary>
/// Contract for components that map the content of a PDF document onto a typed object.
/// </summary>
public interface IPDFMapper
{
    /// <summary>
    /// Maps the PDF identified by <paramref name="pdfPath"/> to an instance of <typeparamref name="T"/>.
    /// </summary>
    /// <typeparam name="T">Destination type; a reference type with a public parameterless constructor.</typeparam>
    /// <param name="pdfPath">Location of the PDF to map.</param>
    /// <param name="cancellationToken">Token that can be used to cancel the operation.</param>
    /// <returns>The mapped object, or <c>null</c> when the implementation produces no result.</returns>
    Task<T?> MapAsync<T>(string pdfPath, CancellationToken cancellationToken = default)
        where T : class, new();
}

src/MapperAI.Core/Mappers/PdfMapper.cs

Lines changed: 52 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -13,27 +13,31 @@ public class PdfMapper : IPDFMapper
1313
private readonly IMapperSerializer _serializer;
1414
private readonly IMapperClientFactory _mapperClientFactory;
1515
private readonly MapperClientConfiguration _clientConfiguration;
16+
private readonly HttpClient? _httpClient;
1617

17-
public PdfMapper(IMapperSerializer serializer, IMapperClientFactory mapperClientFactory, MapperClientConfiguration clientConfiguration)
18+
/// <summary>
/// Creates a PDF mapper.
/// </summary>
/// <param name="serializer">Serializer used for the extracted text, the destination template and the model response.</param>
/// <param name="mapperClientFactory">Factory that builds the client the prompt is sent to.</param>
/// <param name="clientConfiguration">Configuration handed to <paramref name="mapperClientFactory"/> when a client is created.</param>
/// <param name="httpClient">
/// Optional HTTP client. It is only needed for http(s) links: <c>MapAsync</c> throws when it
/// receives a web link and no client was supplied.
/// </param>
public PdfMapper(IMapperSerializer serializer, IMapperClientFactory mapperClientFactory, MapperClientConfiguration clientConfiguration, HttpClient? httpClient = null)
{
    (_serializer, _mapperClientFactory, _clientConfiguration, _httpClient) =
        (serializer, mapperClientFactory, clientConfiguration, httpClient);
}
2325

24-
25-
public async Task<T?> MapAsync<T>(string pdfPath, CancellationToken cancellationToken = default) where T : class, new()
26+
/// <summary>
/// Extracts the text of a PDF, asks the configured client to map it onto the structure of
/// <typeparamref name="T"/>, and deserializes the response.
/// </summary>
/// <typeparam name="T">Destination type; a reference type with a public parameterless constructor.</typeparam>
/// <param name="pdfPath">
/// Either a local file path or a link starting with <c>http://</c> / <c>https://</c>.
/// Web links are downloaded with the <see cref="HttpClient"/> passed to the constructor.
/// </param>
/// <param name="cancellationToken">Token forwarded to the client request.</param>
/// <returns>The deserialized response of the client.</returns>
/// <exception cref="ArgumentException">
/// <paramref name="pdfPath"/> is null or blank, or it is a web link and no <see cref="HttpClient"/> was supplied.
/// </exception>
public async Task<T?> MapAsync<T>(string pdfPath, CancellationToken cancellationToken = default) where T : class, new()
{
    // Fail fast with a descriptive error instead of a NullReferenceException inside IsWebLink.
    if (string.IsNullOrWhiteSpace(pdfPath))
    {
        throw new ArgumentException("A PDF file path or URL is required", nameof(pdfPath));
    }

    // Do not start any work for a request that was cancelled before it began.
    cancellationToken.ThrowIfCancellationRequested();

    var isWeb = IsWebLink(pdfPath);
    if (isWeb && _httpClient is null)
    {
        throw new ArgumentException("HttpClient instance is required");
    }

    var mapperClient = _mapperClientFactory.CreateClient(_clientConfiguration);

    var pdfContent = isWeb
        ? await ExtractPdfWebContent(pdfPath)
        : SerializePdfContent(new PdfReader(pdfPath));

    // The serialized, initialized instance is what the prompt receives as the target structure.
    var destinyObject = new T();
    destinyObject.Initialize();

    var prompt = CreatePrompt(pdfContent, _serializer.Serialize(destinyObject));
    var result = await mapperClient.SendAsync(prompt, cancellationToken);

    return _serializer.Deserialize<T>(result.Value);
}
38+
3539

36-
private string CreatePrompt(string pdfContent, string classStructure)
40+
private static string CreatePrompt(string pdfContent, string classStructure)
3741
{
3842
return $"""
3943
You are a senior software engineer specializing in data extraction and mapping.
@@ -54,32 +58,59 @@ Do not include explanations or markdown formatting.
5458
}
5559

5660

57-
private string ExtractPdfContent(string pdfPath)
61+
/// <summary>
/// Downloads a PDF over HTTP and returns its serialized text content.
/// </summary>
/// <param name="pdfUri">
/// Link to the PDF. Google Drive links that are not already in the
/// <c>uc?export=download</c> form are rewritten through <c>ParseDriveUrl</c>.
/// </param>
/// <param name="cancellationToken">Token used to cancel the HTTP request.</param>
/// <returns>The serialized page texts produced by <c>SerializePdfContent</c>.</returns>
/// <exception cref="InvalidOperationException">No <see cref="HttpClient"/> was supplied to the mapper.</exception>
/// <exception cref="HttpRequestException">The request failed or returned a non-success status code.</exception>
private async Task<string> ExtractPdfWebContent(string pdfUri, CancellationToken cancellationToken = default)
{
    // Scheme and host are case-insensitive, and this is not linguistic text: compare ordinally.
    var isDriveLink = pdfUri.StartsWith("https://drive.google.com", StringComparison.OrdinalIgnoreCase);
    if (isDriveLink && !pdfUri.Contains("uc?export=download"))
    {
        pdfUri = ParseDriveUrl(pdfUri);
    }

    var httpClient = _httpClient
        ?? throw new InvalidOperationException("HttpClient instance is required");

    // Dispose the response (and with it the connection resources) once the text has been extracted.
    using var response = await httpClient.GetAsync(pdfUri, cancellationToken);
    response.EnsureSuccessStatusCode();

    using var pdfStream = await response.Content.ReadAsStreamAsync();

    // SerializePdfContent is synchronous and finishes reading before the stream is disposed.
    return SerializePdfContent(new PdfReader(pdfStream));
}
73+
74+
/// <summary>
/// Extracts the text of every page of a PDF and serializes the resulting list.
/// </summary>
/// <param name="reader">
/// Reader positioned on the PDF. The document opened on it is closed before this method
/// returns; NOTE(review): in iText closing the document also closes the reader unless
/// <c>SetCloseReader(false)</c> was called, so callers must not reuse it afterwards.
/// </param>
/// <returns>The serializer output for the list of cleaned page texts, one entry per page.</returns>
private string SerializePdfContent(PdfReader reader)
{
    var pdfDoc = new PdfDocument(reader);
    try
    {
        // Page count is loop-invariant; also lets the list be sized up front.
        var pageCount = pdfDoc.GetNumberOfPages();
        var extractedData = new List<string>(pageCount);

        // iText page numbers are 1-based.
        for (var pageNumber = 1; pageNumber <= pageCount; pageNumber++)
        {
            var pageText = PdfTextExtractor.GetTextFromPage(pdfDoc.GetPage(pageNumber));
            extractedData.Add(CleanText(pageText));
        }

        return _serializer.Serialize(extractedData);
    }
    finally
    {
        // Previously the document was never closed, leaking the underlying file/stream handle.
        pdfDoc.Close();
    }
}
89+
90+
/// <summary>
/// Converts a Google Drive sharing link into its direct-download form.
/// </summary>
/// <param name="pdfUri">Absolute Drive link whose path contains a <c>d/</c> segment followed by the file id.</param>
/// <returns><c>https://drive.google.com/uc?export=download&amp;id=</c> followed by the file id.</returns>
/// <exception cref="ArgumentException">No non-empty segment follows a <c>d/</c> segment in the path.</exception>
private static string ParseDriveUrl(string pdfUri)
{
    var pathSegments = new Uri(pdfUri).Segments;

    // The file id is the path segment right after the first "d/" marker.
    var markerIndex = Array.IndexOf(pathSegments, "d/");
    var idIndex = markerIndex + 1;

    var fileId = markerIndex >= 0 && idIndex < pathSegments.Length
        ? pathSegments[idIndex].TrimEnd('/')
        : null;

    if (string.IsNullOrEmpty(fileId))
    {
        throw new ArgumentException("Invalid drive link");
    }

    return $"https://drive.google.com/uc?export=download&id={fileId}";
}
75-
76-
private string CleanText(string input)
106+
/// <summary>
/// Flattens page text to a single line: line breaks become single spaces, the literal
/// two-character sequences <c>\n</c> and <c>\r</c> become spaces, and the result is trimmed.
/// </summary>
/// <param name="input">Raw text of one page.</param>
/// <returns>The flattened, trimmed text.</returns>
private static string CleanText(string input)
{
    char[] lineBreaks = ['\n', '\r'];

    // Empty entries are dropped so consecutive line breaks collapse into one space.
    var fragments = input.Split(lineBreaks, StringSplitOptions.RemoveEmptyEntries);
    var singleLine = string.Join(" ", fragments);

    // These are escaped sequences (backslash + letter), not real control characters.
    var withoutEscapes = singleLine
        .Replace("\\n", " ")
        .Replace("\\r", " ");

    return withoutEscapes.Trim();
}
84-
114+
115+
/// <summary>
/// Tells whether <paramref name="pdfPath"/> is an http(s) link rather than a local file path.
/// </summary>
/// <remarks>
/// URI schemes are case-insensitive, so the prefix is compared with
/// <see cref="StringComparison.OrdinalIgnoreCase"/>; the previous culture-sensitive,
/// case-sensitive check treated e.g. <c>HTTPS://host/file.pdf</c> as a local file.
/// </remarks>
private static bool IsWebLink(string pdfPath) =>
    pdfPath.StartsWith("https://", StringComparison.OrdinalIgnoreCase) ||
    pdfPath.StartsWith("http://", StringComparison.OrdinalIgnoreCase);
85116
}

test/MapperAI.Test/PdfMapperTests.cs

Lines changed: 42 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -5,34 +5,53 @@
55

66
namespace MapperAI.Test;
77

8-
public class PdfMapperTests : BaseTests
8+
/// <summary>
/// Integration tests for <see cref="PdfMapper"/> covering Google Drive links, plain web links
/// and local files. They call the real model endpoint and therefore need network access and
/// an API key supplied through the <c>GEMINI_KEY</c> environment variable.
/// </summary>
public class PdfMapperIntegrationTests : BaseTests
{
    private const string ApiKeyVariable = "GEMINI_KEY";

    // One HttpClient for the whole test class: xUnit creates a new class instance per test,
    // and a client per instance would never be disposed.
    private static readonly HttpClient SharedHttpClient = new();

    private readonly IPDFMapper _pdfMapper;

    public PdfMapperIntegrationTests()
    {
        // Credentials must never be committed to source control; read the key from the environment.
        var apiKey = Environment.GetEnvironmentVariable(ApiKeyVariable)
            ?? throw new InvalidOperationException(
                $"Environment variable '{ApiKeyVariable}' must be set to run the PDF mapper integration tests.");

        var clientConfiguration = new MapperClientConfiguration(apiKey, ModelType.GeminiFlash2_0);
        _pdfMapper = new PdfMapper(Serializer, Factory, clientConfiguration, SharedHttpClient);
    }

    [Fact]
    public async Task Should_Map_Curriculum_From_GoogleDrive()
    {
        var model = await _pdfMapper.MapAsync<CurriculumModel>(
            "https://drive.google.com/file/d/1ByhxqDtlX2d_jnmxF8kqgPZJNKs54k4R/view?usp=drive_link");

        Assert.NotNull(model);
        Assert.Equal("Análise e desenvolvimento de sistemas EAD", model?.Curso);
        Assert.Contains("Uninter", model?.Faculdade);
        Assert.Equal("diegomagalhaesdev@gmail.com", model?.Email);
    }

    [Fact]
    public async Task Should_Map_PenalCode_From_Senado()
    {
        var model = await _pdfMapper.MapAsync<CodigoPenal>(
            "https://www2.senado.leg.br/bdsf/bitstream/handle/id/529748/codigo_penal_1ed.pdf");

        Assert.NotNull(model);
        Assert.Equal("Senador Eunício Oliveira", model?.Presidente);
        Assert.Equal(4, model?.SuplentesDeSecretario.Count);
        Assert.Contains("Senador Eduardo Amorim", model?.SuplentesDeSecretario);
    }

    [Fact]
    public async Task Should_Map_Curriculum_From_LocalFile()
    {
        // Relative to the test binaries' working directory.
        var pdfPath = "../../../Curriculo - Diego.pdf";

        var model = await _pdfMapper.MapAsync<CurriculumModel>(pdfPath);

        Assert.NotNull(model);
        Assert.Equal("Análise e desenvolvimento de sistemas EAD", model?.Curso);
        Assert.Equal("diegomagalhaesdev@gmail.com", model?.Email);
        Assert.True(model?.Projects.Count > 0);
    }
}
3655

3756
public class CurriculumModel
3857
{
@@ -50,4 +69,11 @@ public class CurriculumProjects
5069
{
5170
public string Nome { get; set; }
5271
public List<string> Tecnologias { get; set; } = new List<string>();
72+
}
73+
74+
/// <summary>
/// Destination model for the penal-code PDF mapping test.
/// </summary>
public class CodigoPenal
{
    /// <summary>Name mapped into the "Presidente" field; <c>null</c> until populated.</summary>
    public string? Presidente { get; set; }

    /// <summary>Names mapped into the "SuplentesDeSecretario" field; starts empty.</summary>
    public List<string> SuplentesDeSecretario { get; set; } = new();
}

0 commit comments

Comments
 (0)