-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathPositionalPosting.py
More file actions
82 lines (65 loc) · 3.39 KB
/
PositionalPosting.py
File metadata and controls
82 lines (65 loc) · 3.39 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
import json
import pandas as pd
import ast
from Preprocess import get_data_frame_after_preprocess
# Paths of the preprocessed data frames (presumably Excel workbooks, going by
# the .xlsx extension).
# NOTE(review): neither DF_* constant is referenced in the visible part of
# this file — confirm whether they are still needed.
DF_WITH_STOP_WORDS = "./df_after_preprocess_with_stop_words.xlsx"
DF_WITHOUT_STOP_WORDS = "./df_after_preprocess_without_stop_words.xlsx"
# JSON files holding the serialized positional postings lists.
# save_positional_postings_lists() writes to the WITHOUT_STOP_WORDS path.
POSITIONAL_POSTINGS_LIST_FILE_WITH_STOP_WORDS = "./positional_postings_lists_with_stop_words.json"
POSITIONAL_POSTINGS_LIST_FILE_WITHOUT_STOP_WORDS = "./positional_postings_lists_without_stop_words.json"
# Module-level index shared by every function in this file.
# While building: token -> PositionalPosting instance.
# After load_positional_postings_list(): whatever object the JSON file decoded to.
positional_postings_lists = {}
class PositionalPosting:
    """Postings record kept for a single term.

    Attributes:
        frequency_in_all_documents: Running total of the term's positions
            over every document added so far.
        document_index_dict: Maps a document id to the positions recorded
            for the term in that document.
        document_frequency_dict: Maps a document id to the number of
            positions recorded in that document.
        unique_documents_frequency: Number of documents added for the term.
    """

    def __init__(self):
        # Every posting starts empty; the dicts are created per instance so
        # that postings never share state.
        self.frequency_in_all_documents = 0
        self.document_index_dict = {}
        self.document_frequency_dict = {}
        self.unique_documents_frequency = 0

    def toJson(self):
        """Return this posting as an indented JSON string with sorted keys."""

        def attributes_of(obj):
            # Fallback encoder for objects json cannot handle natively:
            # serialize them through their attribute dictionary.
            return obj.__dict__

        return json.dumps(
            self,
            sort_keys=True,
            indent=4,
            default=attributes_of,
        )
# def load_df_after_preprocess(file_name):
# data_frame = pd.read_excel(file_name, sheet_name='Sheet1', usecols="B,C,D")
# return data_frame
def update_positional_postings_list(doc_id, tokens_in_document):
    """Merge one document's token positions into the module-level index.

    Args:
        doc_id: Identifier of the document being indexed; used as the key in
            each posting's per-document dictionaries.
        tokens_in_document: Mapping of token -> sequence of positions at
            which the token occurs in this document.

    Side effects:
        Mutates the module-level ``positional_postings_lists``: a
        ``PositionalPosting`` is created for tokens seen for the first time
        and updated in place otherwise.

    Indexing the same ``doc_id`` again for a token replaces that document's
    earlier contribution instead of adding to it, so the aggregate counters
    always agree with the per-document dictionaries.
    """
    for token, indexes in tokens_in_document.items():
        posting = positional_postings_lists.get(token)
        if posting is None:
            # First time this token is seen in any document.
            posting = PositionalPosting()
            positional_postings_lists[token] = posting

        occurrences = len(indexes)
        previous_occurrences = posting.document_frequency_dict.get(doc_id)
        if previous_occurrences is None:
            # A document not yet recorded for this token.
            posting.unique_documents_frequency += 1
        else:
            # Same document indexed again: drop its old count so the total
            # is not double-counted.
            posting.frequency_in_all_documents -= previous_occurrences

        posting.frequency_in_all_documents += occurrences
        posting.document_index_dict[doc_id] = indexes
        posting.document_frequency_dict[doc_id] = occurrences
def create_positional_postings_lists(data_frame):
    """Index every row of ``data_frame`` and return the module-level index.

    Args:
        data_frame: Frame whose ``'tokens'`` column holds, for each row, the
            mapping handed to ``update_positional_postings_list``. The row's
            index label is used as the document id.

    Returns:
        The module-level ``positional_postings_lists`` dictionary (the same
        object that is updated in place, not a copy).
    """
    for row_label, record in data_frame.iterrows():
        update_positional_postings_list(row_label, record['tokens'])
    return positional_postings_lists
def save_positional_postings_lists(file_name=None):
    """Write the module-level postings index to a JSON file.

    Args:
        file_name: Destination path. When omitted, the index is written to
            ``POSITIONAL_POSTINGS_LIST_FILE_WITHOUT_STOP_WORDS``, which is
            the path this function always used before the parameter existed.

    Entries may be ``PositionalPosting`` objects (index built in this
    process) or plain dictionaries (index replaced by
    ``load_positional_postings_list``); both are serialized.

    Note:
        JSON object keys are always strings, so document ids are written as
        strings regardless of their type in memory.
    """
    if file_name is None:
        file_name = POSITIONAL_POSTINGS_LIST_FILE_WITHOUT_STOP_WORDS

    # Plain dicts come from a previously loaded index and have no __dict__;
    # pass them through untouched.
    serializable = {
        term: posting if isinstance(posting, dict) else posting.__dict__
        for term, posting in positional_postings_lists.items()
    }

    with open(file_name, 'w', encoding='utf-8') as fp:
        json.dump(serializable, fp, sort_keys=True, indent=4, ensure_ascii=False)
def load_positional_postings_list(file_name):
    """Replace the module-level index with the contents of a JSON file.

    Args:
        file_name: Path of the UTF-8 encoded JSON file to read.

    Returns:
        The decoded JSON object, which is also bound to the module-level
        ``positional_postings_lists``. Entries are whatever ``json.load``
        produced (plain dicts), not ``PositionalPosting`` instances.
    """
    global positional_postings_lists
    with open(file_name, 'r', encoding='utf-8') as source:
        positional_postings_lists = json.load(source)
    return positional_postings_lists
if __name__ == "__main__":
    # Script entry point: fetch the preprocessed frame, build the positional
    # index from it and write the index to disk.
    print("in p p")
    preprocessed_frame = get_data_frame_after_preprocess("positional", True, True)
    print(preprocessed_frame)
    create_positional_postings_lists(preprocessed_frame)
    save_positional_postings_lists()