Skip to content

Commit 910dbbf

Browse files
committed
Merge branch 'issue-04' of github.com:eduranm/markapi into eduranm-issue-04
2 parents 7a04334 + ad725c4 commit 910dbbf

7 files changed

Lines changed: 266 additions & 5 deletions

File tree

config/api_router.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,14 +2,14 @@
22
from rest_framework.routers import DefaultRouter, SimpleRouter
33

44
from reference.api.v1.views import ReferenceViewSet
5-
6-
app_name = "reference"
5+
from markup_doc.api.v1.views import ArticleViewSet
76

87
# Pick the router class from the DEBUG setting: DefaultRouter when it is on,
# SimpleRouter otherwise.
router = DefaultRouter() if settings.DEBUG else SimpleRouter()

# API endpoints exposed by this router.
router.register("reference", ReferenceViewSet, basename="reference")
router.register("first_block", ArticleViewSet, basename="first_block")

urlpatterns = router.urls

markup_doc/api/__init__.py

Whitespace-only changes.

markup_doc/api/v1/__init__.py

Whitespace-only changes.

markup_doc/api/v1/serializers.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
from rest_framework import serializers
2+
from markup_doc.models import ArticleDocx
3+
4+
class ArticleDocxSerializer(serializers.ModelSerializer):
    """Model serializer for ``ArticleDocx`` that exposes every model field."""

    class Meta:
        # "__all__" serializes every field declared on the model.
        model = ArticleDocx
        fields = "__all__"

markup_doc/api/v1/views.py

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
from django.shortcuts import render
2+
from django.http import JsonResponse
3+
from rest_framework.permissions import IsAuthenticated
4+
from rest_framework.viewsets import GenericViewSet
5+
from rest_framework.mixins import CreateModelMixin
6+
from rest_framework.response import Response
7+
from markup_doc.api.v1.serializers import ArticleDocxSerializer
8+
from markup_doc.marker import mark_article
9+
10+
import json
11+
12+
# Create your views here.
13+
14+
class ArticleViewSet(
    GenericViewSet,  # generic view functionality
    CreateModelMixin,  # handles POSTs
):
    """POST-only endpoint that runs ``mark_article`` on a JSON payload.

    The request body must be a JSON object; its ``text`` and ``metadata``
    keys are forwarded to ``mark_article``. Only authenticated users may
    call it. ``create`` is overridden, so ``serializer_class`` is not used
    to validate or persist the payload.
    """

    serializer_class = ArticleDocxSerializer
    permission_classes = [IsAuthenticated]
    http_method_names = [
        "post",
    ]

    def create(self, request, *args, **kwargs):
        """Handle POST by delegating to :meth:`api_article`."""
        return self.api_article(request)

    def api_article(self, request):
        """Mark up the posted text and return the result as JSON.

        Returns:
            ``JsonResponse`` with ``{'message': <mark_article result>}`` on
            success, or ``{'error': 'Error processing'}`` with HTTP 400 when
            the body is not a valid JSON object.
        """
        # Keep the try body minimal: only the parsing step is expected to
        # fail on bad input. ValueError covers both json.JSONDecodeError and
        # UnicodeDecodeError (body bytes that are not valid UTF-8).
        try:
            data = json.loads(request.body)
        except ValueError:
            data = None

        # Valid JSON that is not an object (list, string, number, null) has
        # no .get(); reject it instead of failing with a server error.
        if not isinstance(data, dict):
            return JsonResponse({'error': 'Error processing'}, status=400)

        post_text = data.get('text')
        post_metadata = data.get('metadata')

        resp_data = mark_article(post_text, post_metadata)

        return JsonResponse({'message': resp_data})

markup_doc/labeling_utils.py

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -780,3 +780,59 @@ def create_labeled_object2(i, item, state, sections):
780780
}
781781

782782
return obj, result, state
783+
784+
785+
def get_data_first_block(
    text,
    metadata,
    user_id,
    url="http://django:8000/api/v1/first_block/",
    timeout=None,
):
    """Request first-block metadata from the ``first_block`` API endpoint.

    Posts ``text`` and ``metadata`` as JSON, authenticated with a bearer
    access token created for the user identified by ``user_id``.

    Args:
        text: Text of the first block to analyse.
        metadata: Kind of metadata requested; sent as-is in the payload.
        user_id: Primary key of the user on whose behalf the call is made.
        url: Endpoint to call. Defaults to the previously hard-coded
            in-cluster address, so existing callers are unaffected.
        timeout: Passed through to ``requests.post``. ``None`` (the default)
            keeps the original behaviour of waiting indefinitely.

    Returns:
        The decoded JSON carried in the response's ``message`` field, or
        ``None`` when no Llama model file is configured or the endpoint does
        not answer with HTTP 200.

    Raises:
        User.DoesNotExist: If ``user_id`` does not match a user.
        KeyError, ValueError: If a 200 response lacks ``message`` or its
            content is not valid JSON.
    """
    model = LlamaModel.objects.first()

    # .first() returns None when the table is empty; treat that the same as
    # a model without a file instead of raising AttributeError.
    if model is None or not model.name_file:
        return None

    user = User.objects.get(pk=user_id)
    access_token = RefreshToken.for_user(user).access_token

    headers = {
        'Authorization': f'Bearer {access_token}',
        'Content-Type': 'application/json'
    }
    payload = {
        'text': text,
        'metadata': metadata
    }

    response = requests.post(url, json=payload, headers=headers, timeout=timeout)

    if response.status_code != 200:
        return None

    # The endpoint wraps its result as a JSON-encoded string under 'message'.
    message_str = response.json()['message']
    return json.loads(message_str)
815+
816+
817+
def extract_keywords(text):
    """Split a keyword paragraph into an optional heading and its keywords.

    A trailing period is dropped. If the text contains a colon, everything
    before the first colon is the heading and the rest is the keyword list;
    otherwise the whole text is the keyword list. Keywords are separated by
    commas or semicolons.

    Args:
        text: Raw paragraph, e.g. ``"Keywords: a; b, c."``. ``None`` is
            treated as an empty string.

    Returns:
        dict with ``"title"`` (the heading, or ``None`` when there is no
        colon) and ``"keywords"`` (the non-empty keywords joined by ``", "``).
    """
    # Drop the trailing period, if any.
    text = (text or '').strip()
    if text.endswith('.'):
        text = text[:-1].strip()

    # Look for a "heading: content" shape. The lazy first group stops at the
    # first colon.
    match = re.match(r'\s*(.+?)\s*:\s*(.+)', text)

    if match:
        label = match.group(1).strip()
        content = match.group(2).strip()
    else:
        label = None
        content = text

    # Split on semicolons or commas and keep only non-empty entries, so that
    # inputs such as "a; b;" or "a, ;b" do not produce dangling separators.
    parts = re.split(r'\s*[;,]\s*', content)
    clean_keywords = ", ".join(p.strip() for p in parts if p.strip())

    return {"title": label, "keywords": clean_keywords}

markup_doc/tasks.py

Lines changed: 158 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,23 +5,45 @@
55
import json
66
import re
77

8+
# Third-party imports
9+
import langid
10+
811
from markup_doc.models import UploadDocx
912
from markup_doc.labeling_utils import (
1013
split_in_three,
1114
process_reference,
1215
process_references,
16+
extract_keywords,
1317
create_labeled_object2,
18+
get_data_first_block,
1419
get_llm_model_name
1520
)
1621

1722
from markup_doc.models import ProcessStatus
1823
from markup_doc.labeling_utils import MODEL_NAME_GEMINI, MODEL_NAME_LLAMA
1924
from markuplib.function_docx import functionsDocx
20-
from model_ai.llama import LlamaService
25+
from model_ai.llama import LlamaService, LlamaInputSettings
2126
from reference.config_gemini import create_prompt_reference
2227
from markup_doc.sync_api import sync_journals_from_api
2328

2429

30+
def clean_labels(text):
    """Strip bracketed markup tags from ``text`` and normalize its whitespace.

    Args:
        text: Text that may contain tags such as ``[kwd]``, ``[/sectitle]``
            or ``[/ doctitle ]``. ``None`` or an empty string yields ``""``.

    Returns:
        The text without tags, with runs of spaces/tabs collapsed to one
        space, no whitespace before ``; : , .``, consecutive line breaks
        (including lines left blank or whitespace-only) collapsed to one,
        and leading/trailing whitespace removed.
    """
    if not text:
        return ''

    # Remove tags like [kwd] or [sectitle], including spaced variants such
    # as [/ doctitle ] and tags that carry attributes.
    text = re.sub(r'\[\s*/?\s*\w+(?:\s+[^\]]+)?\s*\]', '', text)

    # Collapse runs of spaces/tabs into a single space.
    text = re.sub(r'[ \t]+', ' ', text)

    # Drop whitespace that precedes punctuation.
    text = re.sub(r'\s+([;:,.])', r'\1', text)

    # Collapse consecutive line breaks. Removing a tag that sat alone on a
    # line leaves a whitespace-only line behind, so a single space before
    # each newline is consumed too; indentation of the next non-empty line
    # is left untouched.
    text = re.sub(r' ?\n(?: ?\n)*', '\n', text)

    # Trim leading and trailing whitespace.
    return text.strip()
45+
46+
2547
@celery_app.task()
2648
def task_sync_journals_from_api():
2749
sync_journals_from_api()
@@ -67,6 +89,141 @@ def get_labels(title, user_id):
6789
continue
6890

6991
obj = {}
92+
if item.get('type') in [
93+
'<abstract>',
94+
'<date-accepted>',
95+
'<date-received>',
96+
'<kwd-group>'
97+
]:
98+
if item.get('type') == '<abstract>':
99+
if i + 1 < len(content):
100+
obj['type'] = 'paragraph'
101+
obj['value'] = {
102+
'label': '<abstract-title>',
103+
'paragraph': item.get('text')
104+
}
105+
stream_data.append(obj.copy())
106+
107+
next_item = content[i + 1]
108+
obj['type'] = 'paragraph_with_language'
109+
obj['value'] = {
110+
'label': '<abstract>',
111+
'paragraph': next_item.get('text'),
112+
'language': langid.classify(next_item.get('text'))[0] or None
113+
}
114+
stream_data.append(obj.copy())
115+
116+
elif item.get('type') == '<kwd-group>':
117+
keywords = extract_keywords(item.get('text'))
118+
obj['type'] = 'paragraph'
119+
obj['value'] = {
120+
'label': '<kwd-title>',
121+
'paragraph': keywords['title']
122+
}
123+
stream_data.append(obj.copy())
124+
125+
obj['type'] = 'paragraph_with_language'
126+
obj['value'] = {
127+
'label': '<kwd-group>',
128+
'paragraph': keywords['keywords'],
129+
'language': langid.classify(keywords['title'].replace('<italic>', '').replace('</italic>', ''))[0] or None
130+
}
131+
stream_data.append(obj.copy())
132+
133+
else:
134+
obj['type'] = 'paragraph'
135+
obj['value'] = {
136+
'label': item.get('type') ,
137+
'paragraph': item.get('text')
138+
}
139+
stream_data.append(obj.copy())
140+
continue
141+
142+
if item.get('type') == 'first_block':
143+
llm_first_block = LlamaService(mode='prompt', temperature=0.1)
144+
145+
if get_llm_model_name() == MODEL_NAME_GEMINI:
146+
output = llm_first_block.run(LlamaInputSettings.get_first_metadata(clean_labels(item.get('text'))))
147+
match = re.search(r'\{.*\}', output, re.DOTALL)
148+
if match:
149+
output = match.group(0)
150+
output = json.loads(output)
151+
152+
if get_llm_model_name() == MODEL_NAME_LLAMA:
153+
154+
output_author = get_data_first_block(clean_labels(item.get('text')), 'author', user_id)
155+
156+
output_affiliation = get_data_first_block(clean_labels(item.get('text')), 'affiliation', user_id)
157+
158+
output_doi = get_data_first_block(clean_labels(item.get('text')), 'doi', user_id)
159+
160+
output_title = get_data_first_block(clean_labels(item.get('text')), 'title', user_id)
161+
162+
# 1. Parsear cada salida
163+
doi_section = output_doi
164+
titles = output_title
165+
authors = output_author
166+
affiliations = output_affiliation
167+
168+
# 2. Combinar en un único JSON
169+
output = {
170+
"doi": doi_section.get("doi", ""),
171+
"section": doi_section.get("section", ""),
172+
"titles": titles,
173+
"authors": authors,
174+
"affiliations": affiliations
175+
}
176+
177+
obj['type'] = 'paragraph'
178+
obj['value'] = {
179+
'label': '<article-id>',
180+
'paragraph': output['doi']
181+
}
182+
stream_data.append(obj.copy())
183+
obj['value'] = {
184+
'label': '<subject>',
185+
'paragraph': output['section']
186+
}
187+
stream_data.append(obj.copy())
188+
for i, tit in enumerate(output['titles']):
189+
obj['type'] = 'paragraph_with_language'
190+
obj['value'] = {
191+
'label': '<article-title>' if i == 0 else '<trans-title>',
192+
'paragraph': tit['title'],
193+
'language': tit['language']
194+
}
195+
stream_data.append(obj.copy())
196+
197+
for i, auth in enumerate(output['authors']):
198+
obj['type'] = 'author_paragraph'
199+
obj['value'] = {
200+
'label': '<contrib>',
201+
'surname': auth['surname'],
202+
'given_names': auth['name'],
203+
'orcid': auth['orcid'],
204+
'affid': auth['aff'],
205+
'char': auth['char']
206+
}
207+
stream_data.append(obj.copy())
208+
209+
for i, aff in enumerate(output['affiliations']):
210+
obj['type'] = 'aff_paragraph'
211+
obj['value'] = {
212+
'label': '<aff>',
213+
'affid': aff['aff'],
214+
'char': aff['char'],
215+
'orgname': aff['orgname'],
216+
'orgdiv2': aff['orgdiv2'],
217+
'orgdiv1': aff['orgdiv1'],
218+
'zipcode': aff['postal'],
219+
'city': aff['city'],
220+
'country': aff['name_country'],
221+
'code_country': aff['code_country'],
222+
'state': aff['state'],
223+
'text_aff': aff['text_aff'],
224+
#'original': aff['original']
225+
}
226+
stream_data.append(obj.copy())
70227

71228
if item.get('text') is None or item.get('text') == '':
72229
state['label_next'] = state['label_next_reset'] if state['reset'] else state['label_next']
@@ -119,8 +276,6 @@ def get_labels(title, user_id):
119276
chunks = split_in_three(obj_reference)
120277
output=[]
121278

122-
llm_first_block = LlamaService(mode='prompt', temperature=0.1)
123-
124279
for chunk in chunks:
125280
if len(chunk) > 0:
126281
text_references = "\n".join([item["text"] for item in chunk]).replace('<italic>', '').replace('</italic>', '')

0 commit comments

Comments
 (0)