Skip to content

Commit 7a04334

Browse files
committed
Merge branch 'eduranm-issue-03'
2 parents 585d168 + b654b8b commit 7a04334

9 files changed

Lines changed: 3489 additions & 12 deletions

File tree

config/settings/base.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,7 @@
8383
"xml_manager",
8484
"model_ai",
8585
"markup_doc",
86+
"markuplib",
8687
]
8788

8889
INSTALLED_APPS = DJANGO_APPS + THIRD_PARTY_APPS + LOCAL_APPS + WAGTAIL

markup_doc/labeling_utils.py

Lines changed: 782 additions & 0 deletions
Large diffs are not rendered by default.

markup_doc/marker.py

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
# Standard library imports
2+
import re
3+
4+
# Local application imports
5+
from model_ai.llama import LlamaService, LlamaInputSettings
6+
7+
8+
def mark_article(text, metadata):
    """Extract one kind of front-matter metadata from article text via the LLM.

    Args:
        text: Article text handed to ``LlamaService.run``.
        metadata: Which field to extract; one of ``'author'``,
            ``'affiliation'``, ``'doi'`` or ``'title'``.

    Returns:
        The JSON-looking span of the model reply: the outermost ``{...}``
        for ``'doi'``, the outermost ``[...]`` for every other kind.
        ``None`` when the reply contains no such span.

    Raises:
        ValueError: If ``metadata`` is not one of the supported kinds.
    """
    # Name of the LlamaInputSettings factory that builds the prompt for
    # each supported metadata kind.
    config_getters = {
        'author': 'get_author_config',
        'affiliation': 'get_affiliations',
        'doi': 'get_doi_and_section',
        'title': 'get_titles',
    }
    getter_name = (
        config_getters.get(metadata) if isinstance(metadata, str) else None
    )
    if getter_name is None:
        # An unknown kind used to fall through every branch and crash later
        # with UnboundLocalError on `messages`; fail early and clearly.
        raise ValueError(
            f"Unsupported metadata {metadata!r}; "
            f"expected one of {sorted(config_getters)}"
        )
    messages, response_format = getattr(LlamaInputSettings, getter_name)()

    gll = LlamaService(messages, response_format)
    output = gll.run(text)
    content = output['choices'][0]['message']['content']

    # The DOI prompt is matched as a JSON object, the others as a JSON array.
    # DOTALL lets the greedy span cross line breaks in the reply.
    pattern = r'\{.*\}' if metadata == 'doi' else r'\[.*\]'
    match = re.search(pattern, content, re.DOTALL)
    return match.group(0) if match else None
28+
29+
def mark_reference(reference_text):
    """Lazily yield the model's answers for a single bibliographic reference.

    The prompt and response format are built by
    ``LlamaInputSettings.get_messages_and_response_format_for_reference``;
    the reference text is then run through ``LlamaService`` and the
    ``message.content`` of every returned choice is yielded in order.

    Args:
        reference_text: The raw text of one reference.

    Yields:
        The ``content`` of each entry in the response's ``"choices"`` list.
    """
    prompt_messages, fmt = (
        LlamaInputSettings.get_messages_and_response_format_for_reference(
            reference_text
        )
    )
    service = LlamaService(prompt_messages, fmt)
    response = service.run(reference_text)

    yield from (choice["message"]["content"] for choice in response["choices"])
36+
37+
38+
def mark_references(reference_block):
    """Mark every non-blank line of a newline-separated reference block.

    Args:
        reference_block: Text holding one reference per line.

    Yields:
        One dict per non-blank line, with the stripped line under
        ``"reference"`` and the list of results produced by
        ``mark_reference`` for it under ``"choices"``.
    """
    stripped_rows = (row.strip() for row in reference_block.split("\n"))
    for reference in stripped_rows:
        if not reference:
            # Blank or whitespace-only lines carry no reference.
            continue
        marked = list(mark_reference(reference))
        yield {"reference": reference, "choices": marked}

markup_doc/tasks.py

Lines changed: 136 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,145 @@
11
# Local application imports
22
from config import celery_app
33

4+
# Standard library imports
5+
import json
6+
import re
7+
8+
from markup_doc.models import UploadDocx
9+
from markup_doc.labeling_utils import (
10+
split_in_three,
11+
process_reference,
12+
process_references,
13+
create_labeled_object2,
14+
get_llm_model_name
15+
)
16+
17+
from markup_doc.models import ProcessStatus
18+
from markup_doc.labeling_utils import MODEL_NAME_GEMINI, MODEL_NAME_LLAMA
19+
from markuplib.function_docx import functionsDocx
20+
from model_ai.llama import LlamaService
21+
from reference.config_gemini import create_prompt_reference
422
from markup_doc.sync_api import sync_journals_from_api
523

624

725
@celery_app.task()
def task_sync_journals_from_api():
    """Celery task wrapper that calls ``sync_journals_from_api``."""
    sync_journals_from_api()
28+
29+
30+
@celery_app.task()
def get_labels(title, user_id):
    """Label the content of an uploaded DOCX and store the result on its record.

    The task looks up the ``UploadDocx`` whose ``title`` matches, extracts
    its content with ``functionsDocx``, labels each item with
    ``create_labeled_object2`` and routes the labelled objects into three
    lists (``content``, ``content_body``, ``content_back``). Reference
    paragraphs found in the back matter are collected during the loop and
    processed afterwards, either one by one with ``process_reference`` or in
    batches through ``LlamaService`` and ``process_references``. Finally the
    three lists are saved and ``estatus`` is set to
    ``ProcessStatus.PROCESSED``.

    Args:
        title: Title used to look up the ``UploadDocx`` record.
        user_id: Forwarded to ``process_reference``.

    NOTE(review): the nesting of the routing branches below was
    reconstructed from a rendering that lost indentation - confirm against
    the repository before relying on it.
    """
    # NOTE(review): .get(title=...) presumably assumes titles are unique - confirm.
    article_docx = UploadDocx.objects.get(title=title)
    doc = functionsDocx.openDocx(article_docx.file.path)
    sections, content = functionsDocx().extractContent(doc, article_docx.file.path)
    # Alias of the same object: both names refer to one record.
    article_docx_markup = article_docx
    # text_title and text_paragraph are not read anywhere in this function.
    text_title = ''
    text_paragraph = ''
    stream_data = []
    stream_data_body = []
    stream_data_back = []
    num_ref=0
    # Labelling state threaded through create_labeled_object2, which
    # returns an updated copy on every call.
    state = {
        'label': None,
        'label_next': None,
        'label_next_reset': None,
        'reset': False,
        'repeat': None,
        'body_trans': False,
        'body': False,
        'back': False,
        'references': False
    }
    # Not read anywhere in this function.
    counts = {
        'numref': 0,
        'numtab': 0,
        'numfig': 0,
        'numeq': 0
    }

    # next_item is never set to a truthy value here, so the skip branch at
    # the top of the loop does not fire.
    next_item = None
    obj_reference = []
    # Not read anywhere in this function.
    llama_model = False

    for i, item in enumerate(content):
        if next_item:
            next_item = None
            continue

        obj = {}

        if item.get('text') is None or item.get('text') == '':
            # Item without text: optionally restore the pending label and,
            # if the back matter was being read, switch to the
            # 'references' state.
            state['label_next'] = state['label_next_reset'] if state['reset'] else state['label_next']
            if state['back']:
                state['back'] = False
                state['body'] = False
                state['references'] = True
        else:

            obj, result, state = create_labeled_object2(i, item, state, sections)

            if result:
                # An "introduction" heading seen once 'references' is set
                # inserts a <translate-body> marker and turns on body_trans.
                if item.get('text').lower() in ['introducción', 'introduction', 'introdução'] and state['references']:
                    state['body_trans'] = True
                    obj_trans = {
                        'type': 'paragraph_with_language',
                        'value': {
                            'label': '<translate-body>',
                            'paragraph': 'Translate'
                        }
                    }
                    stream_data_body.append(obj_trans)
                if state['body']:
                    if state['references']:
                        if state['body_trans']:
                            stream_data_body.append(obj)
                        else:
                            stream_data.append(obj)
                    else:
                        stream_data_body.append(obj)
                elif state['back']:
                    if state['label'] == '<sec>':
                        stream_data_back.append(obj)
                    if state['label'] == '<p>':
                        # Back-matter paragraphs are queued as references
                        # and processed after the loop.
                        num_ref = num_ref + 1
                        #obj = {}#process_reference(num_ref, obj, user_id)
                        obj_reference.append({"num_ref": num_ref, "obj": obj, "text": obj['value']['paragraph'],})
                        #stream_data_back.append(obj)
                else:
                    stream_data.append(obj)

    num_refs = [item["num_ref"] for item in obj_reference]

    # NOTE(review): compared against the literal 'LLAMA' although
    # MODEL_NAME_LLAMA is imported in this module - confirm they agree.
    if get_llm_model_name() == 'LLAMA':
        for obj_ref in obj_reference:
            obj = process_reference(obj_ref['num_ref'], obj_ref['obj'], user_id)
            stream_data_back.append(obj)

    else:
        chunks = split_in_three(obj_reference)
        output=[]

        llm_first_block = LlamaService(mode='prompt', temperature=0.1)

        for chunk in chunks:
            if len(chunk) > 0:
                # One reference per line, with <italic> markup stripped
                # before prompting.
                text_references = "\n".join([item["text"] for item in chunk]).replace('<italic>', '').replace('</italic>', '')
                prompt_reference = create_prompt_reference(text_references)

                result = llm_first_block.run(prompt_reference)

                # Take the outermost [...] span of the reply and parse it.
                # NOTE(review): json.loads is unguarded; a malformed reply
                # raises before estatus is updated below.
                match = re.search(r'\[.*\]', result, re.DOTALL)
                if match:
                    parsed = json.loads(match.group(0))
                    output.extend(parsed) # Append to the output list

        stream_data_back.extend(process_references(num_refs, output))

    article_docx_markup.content = stream_data
    article_docx_markup.content_body = stream_data_body
    article_docx_markup.content_back = stream_data_back
    article_docx_markup.save()

    article_docx.estatus = ProcessStatus.PROCESSED
    article_docx.save()
145+

markup_doc/wagtail_hooks.py

Lines changed: 30 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,27 +1,45 @@
11
from django.http import HttpResponseRedirect
2-
from django.template.response import TemplateResponse
32
from django.utils.translation import gettext_lazy as _
4-
from wagtail.admin import messages
5-
from wagtail.snippets.models import register_snippet
3+
from django.contrib import messages
4+
from django.template.response import TemplateResponse
5+
from wagtail_modeladmin.options import ModelAdmin
6+
67
from wagtail.snippets.views.snippets import (
78
CreateView,
89
EditView,
910
SnippetViewSet,
10-
SnippetViewSetGroup,
11+
SnippetViewSetGroup
1112
)
12-
from wagtail_modeladmin.options import ModelAdmin
1313

14-
from markup_doc.models import (
14+
from markup_doc.models import (
1515
ArticleDocx,
1616
ArticleDocxMarkup,
17+
UploadDocx,
18+
MarkupXML,
1719
CollectionModel,
1820
JournalModel,
19-
MarkupXML,
20-
ProcessStatus,
21-
UploadDocx,
21+
ProcessStatus
2222
)
23-
from markup_doc.sync_api import sync_collection_from_api
24-
from markup_doc.tasks import task_sync_journals_from_api
23+
24+
from config.menu import get_menu_order
25+
from markup_doc.tasks import get_labels, task_sync_journals_from_api
26+
from django.urls import path, reverse
27+
from django.utils.html import format_html
28+
from wagtail.admin import messages
29+
from wagtail.admin.views import generic
30+
31+
from django.shortcuts import redirect, get_object_or_404
32+
from django.views import View
33+
34+
from wagtail.snippets.models import register_snippet
35+
from django.db.models.signals import post_save
36+
from django.dispatch import receiver
37+
from django.db import transaction
38+
39+
from wagtail import hooks
40+
from django.templatetags.static import static
41+
from markup_doc.sync_api import sync_collection_from_api, sync_journals_from_api
42+
2543

2644

2745
class ArticleDocxCreateView(CreateView):
@@ -41,6 +59,7 @@ def form_valid(self, form):
4159
self.object = form.save_all(self.request.user)
4260
self.object.estatus = ProcessStatus.PROCESSING
4361
self.object.save()
62+
transaction.on_commit(lambda: get_labels.delay(self.object.title, self.request.user.id))
4463
return HttpResponseRedirect(self.get_success_url())
4564

4665

markuplib/__init__.py

Whitespace-only changes.

0 commit comments

Comments
 (0)