Semantic-Comment-Analyze/nlp_engine.py at main · IDKHowToCodeFR/Semantic-Comment-Analyze · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
"""NLP engine for semantic analysis using transformer models."""

from typing import Dict, Any
from functools import lru_cache

import numpy as np
from sentence_transformers import SentenceTransformer
from transformers import pipeline

# Candidate labels handed to the zero-shot classifier in classify_intent().
INTENT_LABELS = [
    "Bug Report",
    "Feature Request",
    "Question",
    "Praise",
    "Complaint",
    "General Feedback",
]


@lru_cache(maxsize=1)
def load_intent_classifier() -> pipeline:
    """Load and cache the BART-MNLI zero-shot classification pipeline.

    The function takes no arguments, so ``lru_cache(maxsize=1)`` makes every
    successful call after the first return the same pipeline object. A failed
    load is not cached, so a later call will try again.

    Returns:
        The pipeline object built by ``transformers.pipeline`` for the
        ``"zero-shot-classification"`` task.

    Raises:
        RuntimeError: If the pipeline cannot be constructed; the original
            exception is attached as ``__cause__``.
    """
    # NOTE(review): `pipeline` is the factory function, not a class, so the
    # return annotation is informal; kept unchanged for interface stability.
    try:
        return pipeline(
            "zero-shot-classification",
            model="facebook/bart-large-mnli",
            device=-1,
        )
    except Exception as e:
        # Chain explicitly so the traceback shows the real root cause.
        raise RuntimeError(f"Failed to load intent classifier: {e}") from e


@lru_cache(maxsize=1)
def load_embedding_model() -> SentenceTransformer:
    """Load and cache the MiniLM sentence-embedding model.

    The function takes no arguments, so ``lru_cache(maxsize=1)`` makes every
    successful call after the first return the same model object. A failed
    load is not cached, so a later call will try again.

    Returns:
        A ``SentenceTransformer`` built from
        ``sentence-transformers/all-MiniLM-L6-v2``.

    Raises:
        RuntimeError: If the model cannot be constructed; the original
            exception is attached as ``__cause__``.
    """
    try:
        return SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
    except Exception as e:
        # Chain explicitly so the traceback shows the real root cause.
        raise RuntimeError(f"Failed to load embedding model: {e}") from e


def classify_intent(text: str, threshold: float = 0.5) -> Dict[str, Any]:
    """Assign an intent label to a comment via zero-shot classification.

    Args:
        text: Comment to classify; must contain non-whitespace characters.
        threshold: Minimum score a label needs to be kept.

    Returns:
        Dict with 'labels' and 'scores' (at most three entries that met the
        threshold, in the order the classifier returned them) plus
        'top_intent' and 'top_confidence' for the first kept entry. When no
        label meets the threshold, the classifier's first label is used.

    Raises:
        ValueError: If ``text`` is empty or whitespace-only.
    """
    is_blank = not text or not text.strip()
    if is_blank:
        raise ValueError("Input text cannot be empty")

    zero_shot = load_intent_classifier()
    prediction = zero_shot(
        text, candidate_labels=INTENT_LABELS, multi_label=False
    )
    ranked_labels = prediction['labels']
    ranked_scores = prediction['scores']

    confident = []
    for label, score in zip(ranked_labels, ranked_scores):
        if score >= threshold:
            confident.append((label, score))

    # Nothing cleared the bar: fall back to the first result
    # (presumably the best-scoring one; verify against the pipeline docs).
    if not confident:
        confident.append((ranked_labels[0], ranked_scores[0]))

    top_label, top_score = confident[0]
    shortlist = confident[:3]
    return {
        'labels': [label for label, _ in shortlist],
        'scores': [score for _, score in shortlist],
        'top_intent': top_label,
        'top_confidence': top_score,
    }


def analyze_sentiment(text: str) -> Dict[str, float]:
    """Score sentiment by cosine similarity to three fixed anchor phrases.

    The text is embedded together with a positive, a negative and a neutral
    anchor sentence. Its cosine similarity to each anchor is turned into a
    share of the total so that the three shares sum to 1.

    Args:
        text: Comment to analyse; must contain non-whitespace characters.

    Returns:
        Dict with 'positive', 'negative' and 'neutral' shares (each in
        [0, 1]) and 'compound', the positive share minus the negative share
        (in [-1, 1]). If the text has no positive similarity to any anchor,
        the three shares are equal and 'compound' is 0.0.

    Raises:
        ValueError: If ``text`` is empty or whitespace-only.
    """
    if not text or not text.strip():
        raise ValueError("Input text cannot be empty")

    model = load_embedding_model()
    anchors = [
        "This is excellent, amazing, and wonderful!",
        "This is terrible, awful, and horrible!",
        "This is a neutral statement without emotion."
    ]

    embeddings = np.asarray(model.encode([text] + anchors))
    text_emb, anchor_embs = embeddings[0], embeddings[1:]

    dots = anchor_embs @ text_emb
    norms = np.linalg.norm(anchor_embs, axis=1) * np.linalg.norm(text_emb)
    # A zero-norm vector has no direction; treat its similarity as 0 rather
    # than dividing by zero.
    similarities = np.divide(
        dots, norms, out=np.zeros(len(anchors)), where=norms > 0
    )

    # Cosine similarity lies in [-1, 1]. Dividing by a raw sum that contains
    # negative terms can yield shares below 0 or above 1 (or inf/nan when the
    # sum is 0), so negative similarities are clipped to "no affinity" first.
    weights = np.clip(similarities, 0.0, None)
    total = float(weights.sum())
    if total <= 0.0:
        # No anchor is positively similar: report an uninformative split.
        share = 1.0 / len(anchors)
        return {
            'positive': share,
            'negative': share,
            'neutral': share,
            'compound': 0.0
        }

    positive, negative, neutral = (float(w) / total for w in weights)
    return {
        'positive': positive,
        'negative': negative,
        'neutral': neutral,
        'compound': positive - negative
    }


def compute_similarity(text: str, anchor: str) -> float:
    """Return the cosine similarity between the embeddings of two texts.

    Args:
        text: First text; must contain non-whitespace characters.
        anchor: Text to compare against; must contain non-whitespace
            characters.

    Returns:
        The dot product of the two embeddings divided by the product of
        their Euclidean norms, as a Python float.

    Raises:
        ValueError: If ``text`` or ``anchor`` is empty or whitespace-only
            (``text`` is checked first).
    """
    required = (
        (text, "Input text cannot be empty"),
        (anchor, "Anchor comment cannot be empty"),
    )
    for value, message in required:
        if not value or not value.strip():
            raise ValueError(message)

    encoder = load_embedding_model()
    vectors = encoder.encode([text, anchor])
    text_vec = vectors[0]
    anchor_vec = vectors[1]

    dot_product = np.dot(text_vec, anchor_vec)
    magnitude = np.linalg.norm(text_vec) * np.linalg.norm(anchor_vec)
    return float(dot_product / magnitude)