|
5 | 5 | import json |
6 | 6 | import re |
7 | 7 |
|
| 8 | +# Third-party imports |
| 9 | +import langid |
| 10 | + |
8 | 11 | from markup_doc.models import UploadDocx |
9 | 12 | from markup_doc.labeling_utils import ( |
10 | 13 | split_in_three, |
11 | 14 | process_reference, |
12 | 15 | process_references, |
| 16 | + extract_keywords, |
13 | 17 | create_labeled_object2, |
| 18 | + get_data_first_block, |
14 | 19 | get_llm_model_name |
15 | 20 | ) |
16 | 21 |
|
17 | 22 | from markup_doc.models import ProcessStatus |
18 | 23 | from markup_doc.labeling_utils import MODEL_NAME_GEMINI, MODEL_NAME_LLAMA |
19 | 24 | from markuplib.function_docx import functionsDocx |
20 | | -from model_ai.llama import LlamaService |
| 25 | +from model_ai.llama import LlamaService, LlamaInputSettings |
21 | 26 | from reference.config_gemini import create_prompt_reference |
22 | 27 | from markup_doc.sync_api import sync_journals_from_api |
23 | 28 |
|
24 | 29 |
|
def clean_labels(text):
    """Strip bracketed markup labels from *text* and tidy its whitespace.

    The steps run in this order:

    1. Remove bracketed labels such as ``[kwd]``, ``[/sectitle]`` or
       ``[/ doctitle ]``, including labels that carry attributes.
    2. Collapse runs of spaces/tabs into a single space.
    3. Drop any whitespace that precedes ``;``, ``:``, ``,`` or ``.``.
    4. Collapse consecutive line breaks into one, treating lines that
       contain only spaces/tabs as empty.
    5. Strip leading and trailing whitespace.

    Args:
        text: The labeled text to clean. ``None`` is treated as empty.

    Returns:
        The cleaned string; ``''`` when *text* is ``None`` or empty.
    """
    # Callers pass ``item.get('text')``, which may be None; re.sub would
    # raise TypeError on it.
    if not text:
        return ''

    # Remove labels like [kwd] or [sectitle], even when they contain
    # spaces such as [/ doctitle ].
    text = re.sub(r'\[\s*/?\s*\w+(?:\s+[^\]]+)?\s*\]', '', text)

    # Replace runs of spaces/tabs with a single space.
    text = re.sub(r'[ \t]+', ' ', text)

    # Remove whitespace before punctuation marks.
    text = re.sub(r'\s+([;:,.])', r'\1', text)

    # Normalize repeated line breaks. A line that only held labels is left
    # with stray spaces after step 1, so whitespace-only lines must count
    # as empty here or they survive as blank lines in the output.
    text = re.sub(r'\n(?:[ \t]*\n)+', '\n', text)

    # Trim whitespace at both ends.
    return text.strip()
| 45 | + |
| 46 | + |
@celery_app.task()
def task_sync_journals_from_api():
    """Celery task wrapper that invokes ``sync_journals_from_api``.

    Takes no arguments and returns ``None``; all work is delegated to
    ``markup_doc.sync_api.sync_journals_from_api``.
    """
    sync_journals_from_api()
@@ -67,6 +89,141 @@ def get_labels(title, user_id): |
67 | 89 | continue |
68 | 90 |
|
69 | 91 | obj = {} |
| 92 | + if item.get('type') in [ |
| 93 | + '<abstract>', |
| 94 | + '<date-accepted>', |
| 95 | + '<date-received>', |
| 96 | + '<kwd-group>' |
| 97 | + ]: |
| 98 | + if item.get('type') == '<abstract>': |
| 99 | + if i + 1 < len(content): |
| 100 | + obj['type'] = 'paragraph' |
| 101 | + obj['value'] = { |
| 102 | + 'label': '<abstract-title>', |
| 103 | + 'paragraph': item.get('text') |
| 104 | + } |
| 105 | + stream_data.append(obj.copy()) |
| 106 | + |
| 107 | + next_item = content[i + 1] |
| 108 | + obj['type'] = 'paragraph_with_language' |
| 109 | + obj['value'] = { |
| 110 | + 'label': '<abstract>', |
| 111 | + 'paragraph': next_item.get('text'), |
| 112 | + 'language': langid.classify(next_item.get('text'))[0] or None |
| 113 | + } |
| 114 | + stream_data.append(obj.copy()) |
| 115 | + |
| 116 | + elif item.get('type') == '<kwd-group>': |
| 117 | + keywords = extract_keywords(item.get('text')) |
| 118 | + obj['type'] = 'paragraph' |
| 119 | + obj['value'] = { |
| 120 | + 'label': '<kwd-title>', |
| 121 | + 'paragraph': keywords['title'] |
| 122 | + } |
| 123 | + stream_data.append(obj.copy()) |
| 124 | + |
| 125 | + obj['type'] = 'paragraph_with_language' |
| 126 | + obj['value'] = { |
| 127 | + 'label': '<kwd-group>', |
| 128 | + 'paragraph': keywords['keywords'], |
| 129 | + 'language': langid.classify(keywords['title'].replace('<italic>', '').replace('</italic>', ''))[0] or None |
| 130 | + } |
| 131 | + stream_data.append(obj.copy()) |
| 132 | + |
| 133 | + else: |
| 134 | + obj['type'] = 'paragraph' |
| 135 | + obj['value'] = { |
| 136 | + 'label': item.get('type') , |
| 137 | + 'paragraph': item.get('text') |
| 138 | + } |
| 139 | + stream_data.append(obj.copy()) |
| 140 | + continue |
| 141 | + |
| 142 | + if item.get('type') == 'first_block': |
| 143 | + llm_first_block = LlamaService(mode='prompt', temperature=0.1) |
| 144 | + |
| 145 | + if get_llm_model_name() == MODEL_NAME_GEMINI: |
| 146 | + output = llm_first_block.run(LlamaInputSettings.get_first_metadata(clean_labels(item.get('text')))) |
| 147 | + match = re.search(r'\{.*\}', output, re.DOTALL) |
| 148 | + if match: |
| 149 | + output = match.group(0) |
| 150 | + output = json.loads(output) |
| 151 | + |
| 152 | + if get_llm_model_name() == MODEL_NAME_LLAMA: |
| 153 | + |
| 154 | + output_author = get_data_first_block(clean_labels(item.get('text')), 'author', user_id) |
| 155 | + |
| 156 | + output_affiliation = get_data_first_block(clean_labels(item.get('text')), 'affiliation', user_id) |
| 157 | + |
| 158 | + output_doi = get_data_first_block(clean_labels(item.get('text')), 'doi', user_id) |
| 159 | + |
| 160 | + output_title = get_data_first_block(clean_labels(item.get('text')), 'title', user_id) |
| 161 | + |
| 162 | + # 1. Parsear cada salida |
| 163 | + doi_section = output_doi |
| 164 | + titles = output_title |
| 165 | + authors = output_author |
| 166 | + affiliations = output_affiliation |
| 167 | + |
| 168 | + # 2. Combinar en un único JSON |
| 169 | + output = { |
| 170 | + "doi": doi_section.get("doi", ""), |
| 171 | + "section": doi_section.get("section", ""), |
| 172 | + "titles": titles, |
| 173 | + "authors": authors, |
| 174 | + "affiliations": affiliations |
| 175 | + } |
| 176 | + |
| 177 | + obj['type'] = 'paragraph' |
| 178 | + obj['value'] = { |
| 179 | + 'label': '<article-id>', |
| 180 | + 'paragraph': output['doi'] |
| 181 | + } |
| 182 | + stream_data.append(obj.copy()) |
| 183 | + obj['value'] = { |
| 184 | + 'label': '<subject>', |
| 185 | + 'paragraph': output['section'] |
| 186 | + } |
| 187 | + stream_data.append(obj.copy()) |
| 188 | + for i, tit in enumerate(output['titles']): |
| 189 | + obj['type'] = 'paragraph_with_language' |
| 190 | + obj['value'] = { |
| 191 | + 'label': '<article-title>' if i == 0 else '<trans-title>', |
| 192 | + 'paragraph': tit['title'], |
| 193 | + 'language': tit['language'] |
| 194 | + } |
| 195 | + stream_data.append(obj.copy()) |
| 196 | + |
| 197 | + for i, auth in enumerate(output['authors']): |
| 198 | + obj['type'] = 'author_paragraph' |
| 199 | + obj['value'] = { |
| 200 | + 'label': '<contrib>', |
| 201 | + 'surname': auth['surname'], |
| 202 | + 'given_names': auth['name'], |
| 203 | + 'orcid': auth['orcid'], |
| 204 | + 'affid': auth['aff'], |
| 205 | + 'char': auth['char'] |
| 206 | + } |
| 207 | + stream_data.append(obj.copy()) |
| 208 | + |
| 209 | + for i, aff in enumerate(output['affiliations']): |
| 210 | + obj['type'] = 'aff_paragraph' |
| 211 | + obj['value'] = { |
| 212 | + 'label': '<aff>', |
| 213 | + 'affid': aff['aff'], |
| 214 | + 'char': aff['char'], |
| 215 | + 'orgname': aff['orgname'], |
| 216 | + 'orgdiv2': aff['orgdiv2'], |
| 217 | + 'orgdiv1': aff['orgdiv1'], |
| 218 | + 'zipcode': aff['postal'], |
| 219 | + 'city': aff['city'], |
| 220 | + 'country': aff['name_country'], |
| 221 | + 'code_country': aff['code_country'], |
| 222 | + 'state': aff['state'], |
| 223 | + 'text_aff': aff['text_aff'], |
| 224 | + #'original': aff['original'] |
| 225 | + } |
| 226 | + stream_data.append(obj.copy()) |
70 | 227 |
|
71 | 228 | if item.get('text') is None or item.get('text') == '': |
72 | 229 | state['label_next'] = state['label_next_reset'] if state['reset'] else state['label_next'] |
@@ -119,8 +276,6 @@ def get_labels(title, user_id): |
119 | 276 | chunks = split_in_three(obj_reference) |
120 | 277 | output=[] |
121 | 278 |
|
122 | | - llm_first_block = LlamaService(mode='prompt', temperature=0.1) |
123 | | - |
124 | 279 | for chunk in chunks: |
125 | 280 | if len(chunk) > 0: |
126 | 281 | text_references = "\n".join([item["text"] for item in chunk]).replace('<italic>', '').replace('</italic>', '') |
|
0 commit comments