Merge branch 'issue-04' of github.com:eduranm/markapi into eduranm-issue-04

gitnnolabs · gitnnolabs · commit 910dbbf5d015 · 2026-05-04T17:45:55.000-03:00
diff --git a/config/api_router.py b/config/api_router.py
@@ -2,14 +2,14 @@
 from rest_framework.routers import DefaultRouter, SimpleRouter
 
 from reference.api.v1.views import ReferenceViewSet
-
-app_name = "reference"
+from markup_doc.api.v1.views import ArticleViewSet
 
 if settings.DEBUG:
     router = DefaultRouter()
 else:
     router = SimpleRouter()
 
 router.register("reference", ReferenceViewSet, basename="reference")
+router.register("first_block", ArticleViewSet, basename="first_block")
 
 urlpatterns = router.urls
diff --git a/markup_doc/api/__init__.py b/markup_doc/api/__init__.py
diff --git a/markup_doc/api/v1/__init__.py b/markup_doc/api/v1/__init__.py
diff --git a/markup_doc/api/v1/serializers.py b/markup_doc/api/v1/serializers.py
@@ -0,0 +1,7 @@
+from rest_framework import serializers
+from markup_doc.models import ArticleDocx
+
+class ArticleDocxSerializer(serializers.ModelSerializer):
+    class Meta:
+        model = ArticleDocx
+        fields = "__all__"  # serialize every field of ArticleDocx
diff --git a/markup_doc/api/v1/views.py b/markup_doc/api/v1/views.py
@@ -0,0 +1,43 @@
+from django.shortcuts import render
+from django.http import JsonResponse
+from rest_framework.permissions import IsAuthenticated
+from rest_framework.viewsets import GenericViewSet
+from rest_framework.mixins import CreateModelMixin
+from rest_framework.response import Response
+from markup_doc.api.v1.serializers import ArticleDocxSerializer
+from markup_doc.marker import mark_article
+
+import json
+
+# Create your views here.
+
+class ArticleViewSet(
+    GenericViewSet,  # generic view functionality
+    CreateModelMixin,  # handles POSTs
+):
+    serializer_class = ArticleDocxSerializer
+    permission_classes = [IsAuthenticated]
+    http_method_names = [
+        "post",
+    ]
+
+    def create(self, request, *args, **kwargs):
+        return self.api_article(request)  # overrides the mixin's create; serializer_class is not used on this path
+
+    def api_article(self, request):
+        try:
+            data = json.loads(request.body)
+            post_text = data.get('text')  # 'text' field of the JSON body (None if absent)
+            post_metadata = data.get('metadata')  # 'metadata' field of the JSON body (None if absent)
+
+            resp_data = mark_article(post_text, post_metadata)
+
+            response_data = {
+                'message': resp_data,
+            }
+        except json.JSONDecodeError:
+            response_data = {
+                'error': 'Error processing'
+            }
+        # NOTE(review): the error payload is also sent with JsonResponse's default status (200) — confirm a 4xx is not intended.
+        return JsonResponse(response_data)
diff --git a/markup_doc/labeling_utils.py b/markup_doc/labeling_utils.py
@@ -780,3 +780,59 @@ def create_labeled_object2(i, item, state, sections):
         }
 
     return obj, result, state
+
+
+def get_data_first_block(text, metadata, user_id):
+    """POST text/metadata to the first_block API and return the decoded 'message' payload.
+
+    Raises RuntimeError if no LlamaModel row with a name_file exists or the API does
+    not answer 200; the original hit NameError/UnboundLocalError in those cases.
+    """
+    model = LlamaModel.objects.first()
+    if model is None or not model.name_file:
+        raise RuntimeError("No LlamaModel with a name_file is configured")
+
+    # Token issued for `user_id`, sent below as a Bearer credential.
+    user = User.objects.get(pk=user_id)
+    access_token = RefreshToken.for_user(user).access_token
+
+    # FIXME: Hardcoded URL
+    url = "http://django:8000/api/v1/first_block/"
+    headers = {
+        'Authorization': f'Bearer {access_token}',
+        'Content-Type': 'application/json'
+    }
+    payload = {
+        'text': text,
+        'metadata': metadata
+    }
+
+    response = requests.post(url, json=payload, headers=headers)
+    if response.status_code != 200:
+        raise RuntimeError(f"first_block request failed with HTTP {response.status_code}")
+    # 'message' holds a JSON-encoded string, so it is decoded a second time here.
+    return json.loads(response.json()['message'])
+
+
+def extract_keywords(text):
+    """Split 'Label: kw1; kw2.' into {"title": label or None, "keywords": "kw1, kw2"}."""
+    text = text.strip()
+    if text.endswith('.'):  # drop a trailing period, if any
+        text = text[:-1].strip()
+
+    # Look for an optional "label: content" prefix, split at the first colon
+    match = re.match(r'(?i)\s*(.+?)\s*:\s*(.+)', text)
+
+    if match:
+        label = match.group(1).strip()
+        content = match.group(2).strip()
+    else:
+        label = None
+        content = text
+
+    # Split on semicolons or commas
+    keywords = re.split(r'\s*[;,]\s*', content)
+    clean_keywords = [p.strip() for p in keywords if p.strip()]
+    clean_keywords = ", ".join(clean_keywords)  # join the filtered list so empty entries are dropped
+
+    return {"title": label, "keywords": clean_keywords}
diff --git a/markup_doc/tasks.py b/markup_doc/tasks.py
@@ -5,23 +5,45 @@
 import json
 import re
 
+# Third-party imports
+import langid
+
 from markup_doc.models import UploadDocx
 from markup_doc.labeling_utils import (
     split_in_three,
     process_reference,
     process_references,
+    extract_keywords,
     create_labeled_object2,
+    get_data_first_block,
     get_llm_model_name
 )
 
 from markup_doc.models import ProcessStatus
 from markup_doc.labeling_utils import MODEL_NAME_GEMINI, MODEL_NAME_LLAMA
 from markuplib.function_docx import functionsDocx
-from model_ai.llama import LlamaService
+from model_ai.llama import LlamaService, LlamaInputSettings
 from reference.config_gemini import create_prompt_reference
 from markup_doc.sync_api import sync_journals_from_api
 
 
+def clean_labels(text):
+    # Remove bracketed tags such as [kwd] or [sectitle], including ones with inner spaces like [/ doctitle ]
+    text = re.sub(r'\[\s*/?\s*\w+(?:\s+[^\]]+)?\s*\]', '', text)
+
+    # Collapse runs of spaces/tabs into a single space
+    text = re.sub(r'[ \t]+', ' ', text)
+
+    # Remove whitespace (newlines included) that precedes ; : , or .
+    text = re.sub(r'\s+([;:,.])', r'\1', text)
+
+    # Collapse consecutive newlines into one
+    text = re.sub(r'\n+', '\n', text)
+
+    # Strip leading and trailing whitespace
+    return text.strip()
+    
+
 @celery_app.task()
 def task_sync_journals_from_api():
     sync_journals_from_api()
@@ -67,6 +89,141 @@ def get_labels(title, user_id):
             continue
 
         obj = {}
+        if item.get('type') in [
+                                    '<abstract>', 
+                                    '<date-accepted>', 
+                                    '<date-received>',
+                                    '<kwd-group>'
+                                    ]:
+            if item.get('type') == '<abstract>':
+                if i + 1 < len(content):
+                    obj['type'] = 'paragraph'
+                    obj['value'] = {
+                        'label': '<abstract-title>',
+                        'paragraph': item.get('text')
+                    }
+                    stream_data.append(obj.copy())
+
+                    next_item = content[i + 1]
+                    obj['type'] = 'paragraph_with_language'
+                    obj['value'] = {
+                        'label': '<abstract>',
+                        'paragraph': next_item.get('text'),
+                        'language': langid.classify(next_item.get('text'))[0] or None
+                    }
+                    stream_data.append(obj.copy())
+            
+            elif item.get('type') == '<kwd-group>':
+                keywords = extract_keywords(item.get('text'))
+                obj['type'] = 'paragraph'
+                obj['value'] = {
+                        'label': '<kwd-title>',
+                        'paragraph': keywords['title']
+                    }
+                stream_data.append(obj.copy())
+
+                obj['type'] = 'paragraph_with_language'
+                obj['value'] = {
+                        'label': '<kwd-group>',
+                        'paragraph': keywords['keywords'],
+                        'language': langid.classify(keywords['title'].replace('<italic>', '').replace('</italic>', ''))[0] or None
+                    }
+                stream_data.append(obj.copy())
+
+            else:        
+                obj['type'] = 'paragraph'
+                obj['value'] = {
+                    'label': item.get('type') ,
+                    'paragraph': item.get('text')
+                }
+                stream_data.append(obj.copy())
+            continue
+
+        if item.get('type') == 'first_block':
+            llm_first_block = LlamaService(mode='prompt', temperature=0.1)
+
+            if get_llm_model_name() == MODEL_NAME_GEMINI:
+                output = llm_first_block.run(LlamaInputSettings.get_first_metadata(clean_labels(item.get('text'))))
+                match = re.search(r'\{.*\}', output, re.DOTALL)
+                if match:
+                    output = match.group(0)
+                    output = json.loads(output)
+
+            if get_llm_model_name() == MODEL_NAME_LLAMA:
+
+                output_author = get_data_first_block(clean_labels(item.get('text')), 'author', user_id)
+                
+                output_affiliation = get_data_first_block(clean_labels(item.get('text')), 'affiliation', user_id)
+                
+                output_doi = get_data_first_block(clean_labels(item.get('text')), 'doi', user_id)
+                
+                output_title = get_data_first_block(clean_labels(item.get('text')), 'title', user_id)
+
+                # 1. Parsear cada salida
+                doi_section = output_doi
+                titles = output_title
+                authors = output_author
+                affiliations = output_affiliation
+
+                # 2. Combinar en un único JSON
+                output = {
+                    "doi": doi_section.get("doi", ""),
+                    "section": doi_section.get("section", ""),
+                    "titles": titles,
+                    "authors": authors,
+                    "affiliations": affiliations
+                }
+
+            obj['type'] = 'paragraph'
+            obj['value'] = {
+                'label': '<article-id>',
+                'paragraph': output['doi']
+            }
+            stream_data.append(obj.copy())
+            obj['value'] = {
+                'label': '<subject>',
+                'paragraph': output['section']
+            }
+            stream_data.append(obj.copy())
+            for i, tit in enumerate(output['titles']):
+                obj['type'] = 'paragraph_with_language'
+                obj['value'] = {
+                    'label': '<article-title>' if i == 0 else '<trans-title>',
+                    'paragraph': tit['title'],
+                    'language': tit['language']
+                }
+                stream_data.append(obj.copy())
+
+            for i, auth in enumerate(output['authors']):
+                obj['type'] = 'author_paragraph'
+                obj['value'] = {
+                    'label': '<contrib>',
+                    'surname': auth['surname'],
+                    'given_names': auth['name'],
+                    'orcid': auth['orcid'],
+                    'affid': auth['aff'],
+                    'char': auth['char']
+                }
+                stream_data.append(obj.copy())
+
+            for i, aff in enumerate(output['affiliations']):
+                obj['type'] = 'aff_paragraph'
+                obj['value'] = {
+                    'label': '<aff>',
+                    'affid': aff['aff'],
+                    'char': aff['char'],
+                    'orgname': aff['orgname'],
+                    'orgdiv2': aff['orgdiv2'],
+                    'orgdiv1': aff['orgdiv1'],
+                    'zipcode': aff['postal'],
+                    'city': aff['city'],
+                    'country': aff['name_country'],
+                    'code_country': aff['code_country'],
+                    'state': aff['state'],
+                    'text_aff': aff['text_aff'],
+                    #'original': aff['original']
+                }
+                stream_data.append(obj.copy())
 
         if item.get('text') is None or item.get('text') == '':
             state['label_next'] = state['label_next_reset'] if state['reset'] else state['label_next']
@@ -119,8 +276,6 @@ def get_labels(title, user_id):
         chunks = split_in_three(obj_reference)
         output=[]
 
-        llm_first_block = LlamaService(mode='prompt', temperature=0.1)
-
         for chunk in chunks:
             if len(chunk) > 0:
                 text_references = "\n".join([item["text"] for item in chunk]).replace('<italic>', '').replace('</italic>', '')