Skip to content

Commit b3a5141

Browse files
committed
Integrar procesamiento automático de referencias en las tareas de markup_doc
1 parent adbb38d commit b3a5141

1 file changed

Lines changed: 136 additions & 0 deletions

File tree

markup_doc/tasks.py

Lines changed: 136 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,145 @@
11
# Local application imports
22
from config import celery_app
33

4+
# Standard library imports
5+
import json
6+
import re
7+
8+
from markup_doc.models import UploadDocx
9+
from markup_doc.labeling_utils import (
10+
split_in_three,
11+
process_reference,
12+
process_references,
13+
create_labeled_object2,
14+
get_llm_model_name
15+
)
16+
17+
from markup_doc.models import ProcessStatus
18+
from markup_doc.labeling_utils import MODEL_NAME_GEMINI, MODEL_NAME_LLAMA
19+
from markuplib.function_docx import functionsDocx
20+
from model_ai.llama import LlamaService
21+
from reference.config_gemini import create_prompt_reference
422
from markup_doc.sync_api import sync_journals_from_api
523

624

725
@celery_app.task()
def task_sync_journals_from_api():
    """Celery entry point that delegates to ``sync_journals_from_api``.

    Takes no arguments and returns ``None``; the result of the underlying
    call is discarded.
    """
    sync_journals_from_api()
28+
29+
30+
def _parse_reference_list(raw_response):
    """Extract the JSON array embedded in an LLM response.

    The response is searched for the widest ``[...]`` span (first ``[`` to
    last ``]``, newlines included) and that span is decoded as JSON.

    Args:
        raw_response: Text returned by the LLM for one chunk of references.

    Returns:
        The decoded value of the bracketed span, or an empty list when no
        such span exists or it is not valid JSON.
    """
    import logging  # local import keeps this block self-contained

    match = re.search(r'\[.*\]', raw_response, re.DOTALL)
    if match is None:
        return []
    try:
        return json.loads(match.group(0))
    except json.JSONDecodeError:
        # The greedy pattern can capture stray brackets around the payload;
        # skip this chunk (as is done when nothing matches) instead of
        # aborting the whole task.
        logging.getLogger(__name__).warning(
            "Discarding LLM reference output that is not valid JSON"
        )
        return []


@celery_app.task()
def get_labels(title, user_id):
    """Label the content of an uploaded DOCX and store it on its record.

    The ``UploadDocx`` row whose title equals ``title`` is loaded, its file
    is opened and split into content items, and every non-empty item is
    passed through ``create_labeled_object2``. Labeled objects are routed to
    one of three lists (front, body, back) according to the flags kept in
    ``state``. Items labeled ``<p>`` while ``state['back']`` is set are
    collected as references and processed afterwards, either one by one
    (``process_reference``) or in LLM batches (``process_references``).

    Args:
        title: Title used to look up the ``UploadDocx`` record.
        user_id: Forwarded to ``process_reference`` for each reference.

    Returns:
        None. The three lists are written to ``content``, ``content_body``
        and ``content_back`` of the record, which is saved, then marked
        ``ProcessStatus.PROCESSED`` and saved again.

    Raises:
        Whatever ``UploadDocx.objects.get`` raises when the lookup does not
        resolve to exactly one record.
    """
    article_docx = UploadDocx.objects.get(title=title)
    doc = functionsDocx.openDocx(article_docx.file.path)
    sections, content = functionsDocx().extractContent(doc, article_docx.file.path)

    stream_data = []
    stream_data_body = []
    stream_data_back = []
    obj_reference = []
    num_ref = 0
    state = {
        'label': None,
        'label_next': None,
        'label_next_reset': None,
        'reset': False,
        'repeat': None,
        'body_trans': False,
        'body': False,
        'back': False,
        'references': False,
    }
    intro_headings = ('introducción', 'introduction', 'introdução')

    for i, item in enumerate(content):
        text = item.get('text')
        if text is None or text == '':
            # An empty item restores the pending label when a reset was
            # requested, and closes the back section if one is open.
            if state['reset']:
                state['label_next'] = state['label_next_reset']
            if state['back']:
                state['back'] = False
                state['body'] = False
                state['references'] = True
            continue

        obj, result, state = create_labeled_object2(i, item, state, sections)
        if not result:
            continue

        # An introduction heading seen after the references have been closed
        # starts the translated body; a marker object is emitted before it.
        if item.get('text').lower() in intro_headings and state['references']:
            state['body_trans'] = True
            stream_data_body.append({
                'type': 'paragraph_with_language',
                'value': {
                    'label': '<translate-body>',
                    'paragraph': 'Translate',
                },
            })

        if state['body']:
            if state['references'] and not state['body_trans']:
                stream_data.append(obj)
            else:
                stream_data_body.append(obj)
        elif state['back']:
            if state['label'] == '<sec>':
                stream_data_back.append(obj)
            elif state['label'] == '<p>':
                # Reference paragraphs are deferred so they can be processed
                # together once the whole document has been scanned.
                num_ref += 1
                obj_reference.append({
                    "num_ref": num_ref,
                    "obj": obj,
                    "text": obj['value']['paragraph'],
                })
        else:
            stream_data.append(obj)

    if get_llm_model_name() == 'LLAMA':
        for obj_ref in obj_reference:
            stream_data_back.append(
                process_reference(obj_ref['num_ref'], obj_ref['obj'], user_id)
            )
    else:
        # NOTE(review): this branch runs when the configured model name is
        # not 'LLAMA' yet still goes through LlamaService - confirm intended.
        num_refs = [ref["num_ref"] for ref in obj_reference]
        output = []
        llm_first_block = LlamaService(mode='prompt', temperature=0.1)

        for chunk in split_in_three(obj_reference):
            if len(chunk) == 0:
                continue
            text_references = (
                "\n".join([ref["text"] for ref in chunk])
                .replace('<italic>', '')
                .replace('</italic>', '')
            )
            prompt_reference = create_prompt_reference(text_references)
            result = llm_first_block.run(prompt_reference)
            # Append this chunk's parsed references to the output list.
            output.extend(_parse_reference_list(result))

        stream_data_back.extend(process_references(num_refs, output))

    article_docx.content = stream_data
    article_docx.content_body = stream_data_body
    article_docx.content_back = stream_data_back
    article_docx.save()

    article_docx.estatus = ProcessStatus.PROCESSED
    article_docx.save()
145+

0 commit comments

Comments
 (0)