-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathuser_context.py
More file actions
200 lines (173 loc) · 6.97 KB
/
user_context.py
File metadata and controls
200 lines (173 loc) · 6.97 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
import json
import os
import pdfplumber
import jieba
import re
class UserManager:
    """Persist per-user state (bookmarks, applications, profile) in a JSON file.

    All job IDs are normalized to stripped strings before being stored or
    looked up.  This matters because JSON serializes every object key as a
    string, so an ``int`` key written before a save/load round-trip would
    otherwise stop matching afterwards.
    """

    def __init__(self, data_path='user_data.json'):
        """Initialize empty state, then overlay any data found at *data_path*."""
        self.data_path = data_path
        self.bookmarks = []      # list of job_id strings, insertion-ordered, unique
        self.applications = {}   # job_id (str) -> {"status": str, "date": str}
        self.profile = {
            "name": "",
            "phone": "",
            "email": "",
            "school": "",
            "major": "",
            "skills": [],        # list of skill keywords
            "resume_text": ""
        }
        self.load_data()

    def load_data(self):
        """Load persisted state; on a missing or unreadable file keep defaults.

        Errors are reported to stdout and swallowed (best-effort load) so a
        corrupt data file never prevents the application from starting.
        """
        if not os.path.exists(self.data_path):
            return
        try:
            with open(self.data_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
            self.bookmarks = data.get('bookmarks', [])
            # Normalize application keys to str: older versions stored raw
            # (possibly int) job IDs, which JSON coerces to strings anyway.
            self.applications = {
                str(k): v for k, v in (data.get('applications') or {}).items()
            }
            # Merge over the defaults so profile fields introduced after the
            # file was written are still present with their default values.
            self.profile = {**self.profile, **(data.get('profile') or {})}
            # Auto-clean historical duplicate bookmarks on startup.
            normalized = self._normalize_bookmarks(self.bookmarks)
            if normalized != self.bookmarks:
                self.bookmarks = normalized
                self.save_data()
        except Exception as e:
            print(f"Error loading user data: {e}")

    def _normalize_bookmarks(self, bookmarks):
        """Normalize bookmark IDs to stripped strings, de-duplicated in order."""
        normalized = []
        seen = set()
        for raw in bookmarks or []:
            if raw is None:
                continue
            key = str(raw).strip()
            if not key or key in seen:
                continue
            seen.add(key)
            normalized.append(key)
        return normalized

    def save_data(self):
        """Write current state to *data_path* as UTF-8 JSON (best-effort)."""
        try:
            with open(self.data_path, 'w', encoding='utf-8') as f:
                json.dump({
                    "bookmarks": self.bookmarks,
                    "applications": self.applications,
                    "profile": self.profile
                }, f, ensure_ascii=False, indent=4)
        except Exception as e:
            print(f"Error saving user data: {e}")

    def toggle_bookmark(self, job_id):
        """Add *job_id* if not bookmarked, remove it otherwise, and persist.

        Returns True when the job ends up bookmarked, False when it ends up
        removed (or when *job_id* is empty/None, in which case nothing is
        saved).
        """
        key = str(job_id).strip() if job_id is not None else ""
        if not key:
            return False
        self.bookmarks = self._normalize_bookmarks(self.bookmarks)
        if key in self.bookmarks:
            # Remove all occurrences defensively.
            self.bookmarks = [b for b in self.bookmarks if b != key]
            saved = False
        else:
            self.bookmarks.append(key)
            saved = True
        self.save_data()
        return saved

    def remove_bookmark(self, job_id):
        """Remove bookmark explicitly and persist; return True if removed.

        Normalizes the list *before* measuring its length so incidental
        de-duplication cannot be mistaken for a removal of *job_id*.
        """
        key = str(job_id).strip() if job_id is not None else ""
        if not key:
            return False
        cleaned = self._normalize_bookmarks(self.bookmarks)
        remaining = [b for b in cleaned if b != key]
        self.bookmarks = remaining
        if len(remaining) != len(cleaned):
            self.save_data()
            return True
        return False

    def is_bookmarked(self, job_id):
        """Return True if *job_id* (normalized to str) is bookmarked."""
        key = str(job_id).strip() if job_id is not None else ""
        return key in self.bookmarks

    def add_application(self, job_id, status="Applied"):
        """Record an application for *job_id* with a timestamp and persist.

        The key is stored as str so is_applied() keeps matching after the
        JSON round-trip performed by save_data()/load_data().
        """
        self.applications[str(job_id)] = {
            "status": status,
            "date": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        }
        self.save_data()

    def is_applied(self, job_id):
        """Return True if an application is recorded for *job_id*."""
        return str(job_id) in self.applications

    def parse_resume(self, pdf_path):
        """Extract text from the PDF at *pdf_path* and update the profile.

        Returns a (success: bool, message: str) tuple; on failure the
        message carries the exception text.
        """
        text = ""
        try:
            with pdfplumber.open(pdf_path) as pdf:
                for page in pdf.pages:
                    extracted = page.extract_text()
                    if extracted:
                        text += extracted + "\n"
            self.profile['resume_text'] = text
            self._extract_entities(text)
            self.save_data()
            return True, "Resume parsed successfully."
        except Exception as e:
            return False, str(e)

    def _extract_entities(self, text):
        """Populate email/phone/skills/school from *text* via simple heuristics.

        Demo-quality extraction only: regexes for email and mainland-China
        mobile numbers, substring keyword matching for skills, and a naive
        CJK-run grab for the school name.
        """
        # 1. Email
        email_match = re.search(r'[\w\.-]+@[\w\.-]+', text)
        if email_match:
            self.profile['email'] = email_match.group(0)
        # 2. Phone (mainland-China mobile format: 1 + [3-9] + 9 digits)
        phone_match = re.search(r'1[3-9]\d{9}', text)
        if phone_match:
            self.profile['phone'] = phone_match.group(0)
        # 3. Skills (keyword matching from a predefined list).
        # NOTE(review): case-insensitive substring matching, so short tokens
        # such as 'CV', 'PS', 'PR', 'AE' can false-positive inside ordinary
        # words — acceptable for a demo, kept for behavior compatibility.
        common_skills = [
            'Python', 'Java', 'C++', 'SQL', 'HTML', 'CSS', 'JavaScript', 'React', 'Vue',
            'Node.js', 'Django', 'Flask', 'Spring', 'Git', 'Linux', 'Docker', 'Kubernetes',
            'Machine Learning', 'Deep Learning', 'PyTorch', 'TensorFlow', 'NLP', 'CV',
            'Office', 'Excel', 'Word', 'PPT', 'Photoshop', 'PS', 'PR', 'AE'
        ]
        found_skills = set()
        text_lower = text.lower()
        for skill in common_skills:
            if skill.lower() in text_lower:
                found_skills.add(skill)
        self.profile['skills'] = list(found_skills)
        # 4. School: grab the first CJK run ending with "大学" ("university").
        # In a real app, use a university dictionary or an NER model.
        if "大学" in text:
            for line in text.split('\n'):
                if '大学' in line:
                    match = re.search(r'[\u4e00-\u9fa5]+大学', line)
                    if match:
                        self.profile['school'] = match.group(0)
                        break

    def get_recommendation_query(self):
        """Build a query string from the profile for context-aware recommendation.

        Joins skills plus whitespace-split school tokens, de-duplicated in
        insertion order; returns "" when neither skills nor school is set.
        """
        if not self.profile['skills'] and not self.profile['school']:
            return ""
        # Build terms from possibly dirty historical data, keeping order.
        raw_terms = []
        for skill in self.profile.get('skills', []):
            if isinstance(skill, str) and skill.strip():
                raw_terms.append(skill.strip())
        school = self.profile.get('school', '')
        if isinstance(school, str) and school.strip():
            # Split by whitespace so persisted duplicated school tokens collapse.
            raw_terms.extend([t for t in school.split() if t.strip()])
        deduped_terms = []
        seen = set()
        for term in raw_terms:
            if term not in seen:
                seen.add(term)
                deduped_terms.append(term)
        return " ".join(deduped_terms)