Skip to content

Commit ad725c4

Browse files
committed
Integrar la identificación de elementos del front en get_labels
1 parent 0cd08bb commit ad725c4

1 file changed

Lines changed: 158 additions & 3 deletions

File tree

markup_doc/tasks.py

Lines changed: 158 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,23 +5,45 @@
55
import json
66
import re
77

8+
# Third-party imports
9+
import langid
10+
811
from markup_doc.models import UploadDocx
912
from markup_doc.labeling_utils import (
1013
split_in_three,
1114
process_reference,
1215
process_references,
16+
extract_keywords,
1317
create_labeled_object2,
18+
get_data_first_block,
1419
get_llm_model_name
1520
)
1621

1722
from markup_doc.models import ProcessStatus
1823
from markup_doc.labeling_utils import MODEL_NAME_GEMINI, MODEL_NAME_LLAMA
1924
from markuplib.function_docx import functionsDocx
20-
from model_ai.llama import LlamaService
25+
from model_ai.llama import LlamaService, LlamaInputSettings
2126
from reference.config_gemini import create_prompt_reference
2227
from markup_doc.sync_api import sync_journals_from_api
2328

2429

30+
def clean_labels(text):
    """Strip bracketed markup labels from *text* and tidy its whitespace.

    The steps run in this order:

    1. Remove bracketed labels such as ``[kwd]``, ``[/ doctitle ]`` or
       ``[kwd lang="en"]``.
    2. Collapse runs of spaces/tabs into a single space.
    3. Drop whitespace (newlines included) that precedes ``;``, ``:``,
       ``,`` or ``.``.
    4. Collapse consecutive newlines into a single newline.
    5. Trim leading and trailing whitespace.

    Args:
        text: Labeled text to clean. ``None`` is treated as empty input.

    Returns:
        The cleaned string, or ``''`` when *text* is ``None``.
    """
    # Callers pass ``item.get('text')``, which is None when the key is
    # missing; re.sub would raise TypeError on None.
    if text is None:
        return ''

    # Remove labels like [kwd] or [sectitle], even when they contain
    # spaces or attributes, e.g. [/ doctitle ].
    text = re.sub(r'\[\s*/?\s*\w+(?:\s+[^\]]+)?\s*\]', '', text)

    # Replace runs of spaces/tabs with a single space.
    text = re.sub(r'[ \t]+', ' ', text)

    # Remove whitespace before punctuation marks.
    text = re.sub(r'\s+([;:,.])', r'\1', text)

    # Normalize multiple line breaks.
    text = re.sub(r'\n+', '\n', text)

    # Strip whitespace at both ends.
    return text.strip()
45+
46+
2547
@celery_app.task()
def task_sync_journals_from_api():
    """Celery task that delegates to ``sync_journals_from_api``.

    The helper's return value is discarded; the task itself returns None.
    """
    sync_journals_from_api()
@@ -67,6 +89,141 @@ def get_labels(title, user_id):
6789
continue
6890

6991
obj = {}
92+
if item.get('type') in [
93+
'<abstract>',
94+
'<date-accepted>',
95+
'<date-received>',
96+
'<kwd-group>'
97+
]:
98+
if item.get('type') == '<abstract>':
99+
if i + 1 < len(content):
100+
obj['type'] = 'paragraph'
101+
obj['value'] = {
102+
'label': '<abstract-title>',
103+
'paragraph': item.get('text')
104+
}
105+
stream_data.append(obj.copy())
106+
107+
next_item = content[i + 1]
108+
obj['type'] = 'paragraph_with_language'
109+
obj['value'] = {
110+
'label': '<abstract>',
111+
'paragraph': next_item.get('text'),
112+
'language': langid.classify(next_item.get('text'))[0] or None
113+
}
114+
stream_data.append(obj.copy())
115+
116+
elif item.get('type') == '<kwd-group>':
117+
keywords = extract_keywords(item.get('text'))
118+
obj['type'] = 'paragraph'
119+
obj['value'] = {
120+
'label': '<kwd-title>',
121+
'paragraph': keywords['title']
122+
}
123+
stream_data.append(obj.copy())
124+
125+
obj['type'] = 'paragraph_with_language'
126+
obj['value'] = {
127+
'label': '<kwd-group>',
128+
'paragraph': keywords['keywords'],
129+
'language': langid.classify(keywords['title'].replace('<italic>', '').replace('</italic>', ''))[0] or None
130+
}
131+
stream_data.append(obj.copy())
132+
133+
else:
134+
obj['type'] = 'paragraph'
135+
obj['value'] = {
136+
'label': item.get('type') ,
137+
'paragraph': item.get('text')
138+
}
139+
stream_data.append(obj.copy())
140+
continue
141+
142+
if item.get('type') == 'first_block':
143+
llm_first_block = LlamaService(mode='prompt', temperature=0.1)
144+
145+
if get_llm_model_name() == MODEL_NAME_GEMINI:
146+
output = llm_first_block.run(LlamaInputSettings.get_first_metadata(clean_labels(item.get('text'))))
147+
match = re.search(r'\{.*\}', output, re.DOTALL)
148+
if match:
149+
output = match.group(0)
150+
output = json.loads(output)
151+
152+
if get_llm_model_name() == MODEL_NAME_LLAMA:
153+
154+
output_author = get_data_first_block(clean_labels(item.get('text')), 'author', user_id)
155+
156+
output_affiliation = get_data_first_block(clean_labels(item.get('text')), 'affiliation', user_id)
157+
158+
output_doi = get_data_first_block(clean_labels(item.get('text')), 'doi', user_id)
159+
160+
output_title = get_data_first_block(clean_labels(item.get('text')), 'title', user_id)
161+
162+
# 1. Parsear cada salida
163+
doi_section = output_doi
164+
titles = output_title
165+
authors = output_author
166+
affiliations = output_affiliation
167+
168+
# 2. Combinar en un único JSON
169+
output = {
170+
"doi": doi_section.get("doi", ""),
171+
"section": doi_section.get("section", ""),
172+
"titles": titles,
173+
"authors": authors,
174+
"affiliations": affiliations
175+
}
176+
177+
obj['type'] = 'paragraph'
178+
obj['value'] = {
179+
'label': '<article-id>',
180+
'paragraph': output['doi']
181+
}
182+
stream_data.append(obj.copy())
183+
obj['value'] = {
184+
'label': '<subject>',
185+
'paragraph': output['section']
186+
}
187+
stream_data.append(obj.copy())
188+
for i, tit in enumerate(output['titles']):
189+
obj['type'] = 'paragraph_with_language'
190+
obj['value'] = {
191+
'label': '<article-title>' if i == 0 else '<trans-title>',
192+
'paragraph': tit['title'],
193+
'language': tit['language']
194+
}
195+
stream_data.append(obj.copy())
196+
197+
for i, auth in enumerate(output['authors']):
198+
obj['type'] = 'author_paragraph'
199+
obj['value'] = {
200+
'label': '<contrib>',
201+
'surname': auth['surname'],
202+
'given_names': auth['name'],
203+
'orcid': auth['orcid'],
204+
'affid': auth['aff'],
205+
'char': auth['char']
206+
}
207+
stream_data.append(obj.copy())
208+
209+
for i, aff in enumerate(output['affiliations']):
210+
obj['type'] = 'aff_paragraph'
211+
obj['value'] = {
212+
'label': '<aff>',
213+
'affid': aff['aff'],
214+
'char': aff['char'],
215+
'orgname': aff['orgname'],
216+
'orgdiv2': aff['orgdiv2'],
217+
'orgdiv1': aff['orgdiv1'],
218+
'zipcode': aff['postal'],
219+
'city': aff['city'],
220+
'country': aff['name_country'],
221+
'code_country': aff['code_country'],
222+
'state': aff['state'],
223+
'text_aff': aff['text_aff'],
224+
#'original': aff['original']
225+
}
226+
stream_data.append(obj.copy())
70227

71228
if item.get('text') is None or item.get('text') == '':
72229
state['label_next'] = state['label_next_reset'] if state['reset'] else state['label_next']
@@ -119,8 +276,6 @@ def get_labels(title, user_id):
119276
chunks = split_in_three(obj_reference)
120277
output=[]
121278

122-
llm_first_block = LlamaService(mode='prompt', temperature=0.1)
123-
124279
for chunk in chunks:
125280
if len(chunk) > 0:
126281
text_references = "\n".join([item["text"] for item in chunk]).replace('<italic>', '').replace('</italic>', '')

0 commit comments

Comments
 (0)