Skip to content

Commit 8c68f5c

Browse files
committed
A utility for checking whether a text is meaningful. It will help decide whether to use the text detected from the raw image or from the processed image.
1 parent a6df6fd commit 8c68f5c

1 file changed

Lines changed: 15 additions & 0 deletions

File tree

text_analysis.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,3 +38,18 @@ def analyze(text: str):
3838
most_common = freq_dist.most_common(10)
3939
collocations = text.collocations()
4040
return {"length": length, "most_common": most_common, "uniques": uniques, "collocations": collocations}
41+
42+
43+
def is_meaningful_content(text: str) -> bool:
    """Return whether *text* looks like meaningful content.

    The text is rejected (``False``) when either:

    * it is empty or consists only of whitespace, which includes the page
      end marker ``\\x0c`` (form feed); or
    * more than half of its tokens are single characters other than
      'a' or 'i'.

    Args:
        text: The text to check.

    Returns:
        ``True`` if the text passes both checks, ``False`` otherwise.
    """
    SINGLE_CHARACTER_PERCENTAGE_THRESHOLD = 0.5

    # If we were able to extract only page end markers (\x0c) or other
    # whitespace, it is non-meaningful content. str.strip() treats \x0c as
    # whitespace, so this covers the marker-only case and avoids dividing by
    # zero on an empty token list below.
    if not text or not text.strip():
        return False

    words = nltk.word_tokenize(text)
    if not words:
        # Defensive: the tokenizer produced nothing to measure.
        return False

    # Count single-character tokens that are not the words 'a' or 'i'.
    # NOTE(review): any single-character token the tokenizer emits is counted
    # here, presumably including punctuation - confirm that is intended.
    single_character_count = sum(
        1 for word in words if len(word) == 1 and word.lower() not in ("a", "i")
    )
    return single_character_count / len(words) <= SINGLE_CHARACTER_PERCENTAGE_THRESHOLD

0 commit comments

Comments
 (0)