-
Notifications
You must be signed in to change notification settings - Fork 2.8k
Expand file tree
/
Copy pathts_vecto_util.py
More file actions
88 lines (69 loc) · 2.22 KB
/
ts_vecto_util.py
File metadata and controls
88 lines (69 loc) · 2.22 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
# coding=utf-8
"""
@project: maxkb
@Author:虎
@file: ts_vecto_util.py
@date:2024/4/16 15:26
@desc:
"""
import re
import uuid
from typing import List
import jieba
import jieba.posseg
jieba_word_list_cache = [chr(item) for item in range(38, 84)]
for jieba_word in jieba_word_list_cache:
jieba.add_word('#' + jieba_word + '#')
# r"(?i)\b(?:https?|ftp|tcp|file)://[^\s]+\b",
# 某些不分词数据
# r'"([^"]*)"'
word_pattern_list = [r"v\d+.\d+.\d+",
r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}"]
remove_chars = '\n , :\'<>!@#¥%……&*()!@#$%^&*(): ;,/"./'
jieba_remove_flag_list = ['x', 'w']
def get_word_list(text: str):
result = []
for pattern in word_pattern_list:
word_list = re.findall(pattern, text)
for child_list in word_list:
for word in child_list if isinstance(child_list, tuple) else [child_list]:
# 不能有: 所以再使用: 进行分割
if word.__contains__(':'):
item_list = word.split(":")
for w in item_list:
result.append(w)
else:
result.append(word)
return result
def replace_word(word_dict, text: str):
for key in word_dict:
pattern = '(?<!#)' + re.escape(word_dict[key]) + '(?!#)'
text = re.sub(pattern, key, text)
return text
def get_word_key(text: str, use_word_list):
j_word = next((j for j in jieba_word_list_cache if j not in text and all(j not in used for used in use_word_list)),
None)
if j_word:
return j_word
j_word = str(uuid.uuid1())
jieba.add_word(j_word)
return j_word
def to_word_dict(word_list: List, text: str):
word_dict = {}
for word in word_list:
key = get_word_key(text, set(word_dict))
word_dict['#' + key + '#'] = word
return word_dict
def get_key_by_word_dict(key, word_dict):
v = word_dict.get(key)
if v is None:
return key
return v
def to_ts_vector(text: str):
# 分词
result = jieba.lcut(text)
return " ".join(result)
def to_query(text: str):
extract_tags = jieba.lcut(text)
result = " ".join(extract_tags)
return result