Data-extraction-/data_extraction.py at main · vivek152oo3/Data-extraction- · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
# -*- coding: utf-8 -*-
"""data_extraction.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1rylPvDzYHJQvqn2iFggWEyXb2YSaEBg1
"""

import pandas as pd
import numpy as np
import re
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
nltk.download('punkt')

pip install nltk

import os

!pip install syllables

!pip install textstat

import syllables
import textstat
import math

from nltk.corpus import wordnet

# Load the input sheet (exported as CSV by Google Sheets) into a DataFrame.
raw = pd.read_csv('https://docs.google.com/spreadsheets/d/1XJIkYI7439sZbXeiEWzM7NboYKYOaujW/export?format=csv')

# Notebook preview of the first rows.
raw.head()

# Work on at most the first 100 rows of the sheet.
df = raw.head(100)

# Series holding the 'URL' column of those rows.
url = df['URL']

# Notebook preview of the URL series.
url

import requests


import requests
from bs4 import BeautifulSoup

def scraping_title(url):
    """Download a page and return the text of its headline element.

    Args:
        url: Address of the page to download.

    Returns:
        The stripped text of the first ``h1.entry-title`` or
        ``h1.tdb-title-text`` element, or the string ``"Title not found"``
        when the page contains neither.

    Raises:
        requests.exceptions.RequestException: If the request fails or the
            server does not respond within the timeout.
    """
    # Without a timeout requests can wait forever on an unresponsive
    # server, which would stall the whole scraping run.
    response = requests.get(url, timeout=30)
    soup = BeautifulSoup(response.text, 'html.parser')
    heading = soup.select_one('h1.entry-title, h1.tdb-title-text')

    if heading is None:
        return "Title not found"
    return heading.text.strip()

# Spot-check the title scraper on a single URL.
scraping_title(url[86])

# Scrape the title of every URL, in the order of the URL series.
title_list = [scraping_title(a) for a in url]
title_list

# Report the pages for which no title was found. The comparison has to use
# the exact sentinel string returned by scraping_title(); any other literal
# never matches and missing titles would go unreported.
# NOTE(review): the +9 offset is kept unchanged; what it maps to is not
# evident from this file - confirm.
for i, title in enumerate(title_list):
  if title == "Title not found":
    print(i+9)

def scraping_text(url):
  """Download a page, print the text of its article body and return it.

  Args:
    url: Address of the page to download.

  Returns:
    The text of the first ``div`` whose class is
    ``td-post-content tagdiv-type``, or the string ``"no text"`` when the
    page has no such element. The same string is also printed.

  Raises:
    requests.exceptions.RequestException: If the request fails or the
      server does not respond within the timeout.
  """
  # Without a timeout requests can wait forever on an unresponsive server.
  response = requests.get(url, timeout=30)
  soup = BeautifulSoup(response.text, 'html.parser')
  container = soup.find('div', class_ = 'td-post-content tagdiv-type')
  # get_text() already collects the text of every descendant, so it is
  # called exactly once; calling it once per child node would repeat the
  # whole article as many times as the container has children.
  text = container.get_text(strip = True) if container else "no text"
  print(text)
  return text

# Spot-check the article-text scraper on a single URL.
scraping_text(url[75])

def scraping_data(url, i):
    """Download a page and save its title and body text to a numbered file.

    The file is written to ``Output_scraped_data/blackassign<i>`` (the
    folder is created when missing) as UTF-8: a ``Title: ...`` line followed
    by the body text.

    Args:
        url: Address of the page to download.
        i: Number appended to the output file name.

    Raises:
        requests.exceptions.RequestException: If the request fails or the
            server does not respond within the timeout.
    """
    # Without a timeout requests can wait forever on an unresponsive
    # server, which would stall the whole scraping run.
    response = requests.get(url, timeout=30)
    all_text = BeautifulSoup(response.text, 'html.parser')

    # Fall back to placeholder strings so a file is written for every URL.
    title_tag = all_text.select_one('h1.entry-title , h1.tdb-title-text')
    title = title_tag.get_text(strip=True) if title_tag else "No Title"

    body_tag = all_text.find('div', class_='td-post-content tagdiv-type')
    text = body_tag.get_text(strip=True) if body_tag else "No Text"

    folder_name = "Output_scraped_data"
    os.makedirs(folder_name, exist_ok=True)
    file_name = os.path.join(folder_name, f"blackassign{i}")

    with open(file_name, 'w', encoding='utf-8') as file:
        file.write(f"Title: {title}\n")
        file.write(text)

# Scrape every URL into its own output file; file numbers start at 1.
for file_number, page_url in enumerate(url, start=1):
  scraping_data(page_url, file_number)

# import shutil

# # Specify the path to the folder you want to delete
# folder_path = '/content/Output_scraped_data'  # Replace with the actual path

# # Use shutil.rmtree to delete the folder and its contents
# shutil.rmtree(folder_path)

# # Optionally, check if the folder still exists
# if not os.path.exists(folder_path):
#     print(f"The folder {folder_path} has been successfully deleted.")
# else:
#     print(f"Failed to delete the folder {folder_path}.")


# Output schema. Every label appears exactly once: repeated labels create
# duplicate columns, which make column selection and assignment ambiguous.
Columns = [
    'URL_ID',
    'URL',
    'POSITIVE SCORE',
    'NEGATIVE SCORE',
    'POLARITY SCORE',
    'SUBJECTIVITY SCORE',
    'AVG SENTENCE LENGTH',
    'PERCENTAGE OF COMPLEX WORDS',
    'FOG INDEX',
    'AVG NUMER OF WORDS PER SENTENCE',
    'COMPLEX WORD COUNT',
    'WORD COUNT',
    'SYLLABLE PER WORD',
    'PERSONAL PRONOUNS',
    'AVG WORD LENGHT',
]
df = pd.DataFrame(columns = Columns)

# NOTE(review): only the first 20 URL_IDs are taken here although `url` was
# built from up to 100 rows; this appears to limit the output to 20 rows -
# confirm that is intended.
url_id = raw['URL_ID'].head(20)
df['URL_ID'] = url_id
df['URL'] = url
df

# Read the positive-word list (opened with the platform default encoding).
with open('positive-words.txt', 'r') as file:
  positive_words_text = file.read()
# Read the negative-word list, decoded as latin-1.
with open('negative-words.txt','r',encoding = 'latin-1') as file:
  negative_words_text = file.read()
# Every whitespace-separated token of each file becomes one list entry.
positive_words = positive_words_text.split()
negative_words = negative_words_text.split()
# Print both lists for inspection.
print(positive_words)
print(negative_words)

def calculate_scores(text):
  """Count lexicon hits in a sequence of tokens and derive a polarity score.

  Args:
    text: Iterable of word tokens (for example the result of
      ``str.split()``). Tokens are compared to the lexicon entries exactly
      as given.

  Returns:
    A tuple ``(positive_score, negative_score, polarity_score)`` where the
    first two are the number of tokens found in ``positive_words`` and
    ``negative_words`` and the third is
    ``(positive - negative) / (positive + negative + 0.000001)``.
  """
  # Set membership is O(1); testing every token against the raw lists would
  # cost one full list scan per token.
  positive_lookup = set(positive_words)
  negative_lookup = set(negative_words)

  # Single pass, so one-shot iterables are counted correctly for both scores.
  positive_score = 0
  negative_score = 0
  for word in text:
    if word in positive_lookup:
      positive_score += 1
    if word in negative_lookup:
      negative_score += 1

  # The small constant keeps the division defined when there are no hits.
  polarity_score = (positive_score - negative_score) / ((positive_score + negative_score) + 0.000001)
  return positive_score, negative_score, polarity_score

# Per-file sentiment scores plus word and sentence totals.
score_columns = ['file','positive score','negative score', 'polarity score','total word count', 'total sentence count']
score_dataframe = pd.DataFrame(columns = score_columns)
score_dataframe

import os
import pandas as pd
folder_path = '/content/Output_scraped_data'
score_rows = []
for file in os.listdir(folder_path):
  file_path = os.path.join(folder_path,file)
  file_part = os.path.splitext(file)[0]
  # The scraped files are written as UTF-8, so read them back as UTF-8
  # instead of relying on the platform default encoding.
  with open(file_path,'r', encoding='utf-8') as raw_data:
    text = raw_data.read()
  sentences = sent_tokenize(text)
  number_of_sentences = len(sentences)
  # From here on `text` is the list of whitespace-separated tokens.
  text = text.split()
  word_count = len(text)
  positive_score, negative_score, polarity_score = calculate_scores(text)
  score_rows.append({'file':file_part,'positive score':positive_score,'negative score':negative_score,'polarity score':polarity_score,'total word count':word_count,'total sentence count':number_of_sentences})

# DataFrame.append was removed in pandas 2.0; collecting the rows and
# building the frame once also avoids copying the frame on every iteration.
score_dataframe = pd.DataFrame(score_rows, columns = score_columns)

score_dataframe

# Notebook previews only: the results of these two calls are not assigned.
score_dataframe.drop_duplicates()

score_dataframe.sort_values(by = 'file', ascending = True)

def natural_sort_key(file_name):
    """Build a sort key that orders embedded numbers by numeric value.

    ``re.split`` with a capturing group returns the text between matches at
    even positions and the captured digit runs at odd positions, so the
    position decides which parts are converted to ``int``. Testing the parts
    with ``str.isdigit`` instead is unsafe: it also accepts characters such
    as superscript digits, which ``int`` rejects with ``ValueError``.

    Args:
        file_name: The name to build a key for.

    Returns:
        A list alternating lower-cased text parts and ``int`` values, e.g.
        ``["blackassign", 10, ""]`` for ``"blackassign10"``.
    """
    parts = re.split(r'(\d+)', file_name)
    return [
        int(part) if position % 2 else part.lower()
        for position, part in enumerate(parts)
    ]

# Order the rows by the number embedded in each file name, then renumber the
# index. Assigning a column from one DataFrame to another aligns on index
# labels, not on row position, so without the reset the sorted scores would
# still be paired with df rows according to the directory-listing order.
score_dataframe = score_dataframe.sort_values(by='file', key=lambda x: x.map(natural_sort_key)).reset_index(drop = True)

score_dataframe.head(100)

df['POSITIVE SCORE'],df['NEGATIVE SCORE'], df['POLARITY SCORE'] = score_dataframe['positive score'], score_dataframe['negative score'], score_dataframe['polarity score']

df

df.head()

# Empty frame for per-file sentence counts, followed by a notebook preview.
sentence_countdf = pd.DataFrame(columns = ['file', 'sentence count'])
sentence_countdf

def sentence_count(file_path):
  """Return the number of sentences ``sent_tokenize`` finds in a text file.

  Args:
    file_path: Path of the text file to read.

  Returns:
    The number of sentences in the file's content.
  """
  # The scraped files are written as UTF-8, so read them back as UTF-8
  # instead of relying on the platform default encoding.
  with open(file_path, 'r', encoding='utf-8') as files:
    texts = files.read()
  return len(sent_tokenize(texts))

# Count the sentences of every scraped file.
sentence_rows = []
for file in os.listdir(folder_path):
  filepath = os.path.join(folder_path, file)
  filepart = os.path.splitext(file)[0]
  sentence_number = sentence_count(filepath)
  sentence_rows.append({'file':filepart, 'sentence count': sentence_number})

# DataFrame.append was removed in pandas 2.0; build the frame once from the
# collected rows instead.
sentence_countdf = pd.DataFrame(sentence_rows, columns = ['file', 'sentence count'])

sentence_countdf = sentence_countdf.reset_index(drop = True)

sentence_countdf

sentence_countdf.head(20)

# Placeholder column; nothing in this block fills it.
sentence_countdf['subejctivity_score'] = None
sentence_countdf

df.head(10)

# Keep one row per file and renumber the index.
score_dataframe = score_dataframe.drop_duplicates(subset = ['file'],keep = 'first')
score_dataframe = score_dataframe.reset_index(drop = True)

# Attach the sentence counts by file name. A plain column assignment aligns
# on index labels, and the two frames are not guaranteed to share row order
# (one is sorted and re-indexed, the other follows the directory listing),
# so counts could end up on the wrong files.
sentence_counts_by_file = sentence_countdf.drop_duplicates(subset = ['file']).set_index('file')['sentence count']
score_dataframe['total sentence count'] = score_dataframe['file'].map(sentence_counts_by_file)

# The small constant keeps the division defined for an empty file.
score_dataframe['subjectivity score'] = (score_dataframe['positive score'] + score_dataframe['negative score'])/ (score_dataframe['total word count'] + 0.000001)
# Average sentence length in words, truncated to a whole number.
score_dataframe['average sentence lenght'] = score_dataframe['total word count']/score_dataframe['total sentence count']
score_dataframe['average sentence lenght'] = score_dataframe['average sentence lenght'].astype(int)

score_dataframe.head(10)

# Spot-check: textstat's syllable count for the full text of one scraped file.
with open('/content/Output_scraped_data/blackassign6','r') as file:
  text = file.read()

syllable_count =  textstat.syllable_count(text)
print(syllable_count)

# Download the WordNet corpus for nltk.
nltk.download('wordnet')

# Empty frame for per-file complex-word counts, followed by a notebook preview.
tempdf = pd.DataFrame(columns = ['file','complex word count'])
tempdf

def is_complex(word):
    """Report whether WordNet has at least one synset for ``word``.

    NOTE(review): this flags every word WordNet knows; confirm that this is
    the intended meaning of "complex" for the complex-word and fog-index
    columns computed from it.
    """
    return bool(wordnet.synsets(word))

def count_complex_words(text):
    """Return how many tokens of ``text`` are flagged by ``is_complex``.

    The text is split with ``nltk.word_tokenize``; every token, including
    repeats, is tested individually.
    """
    tokens = nltk.word_tokenize(text)
    flagged = [token for token in tokens if is_complex(token)]
    return len(flagged)

folder = "/content/Output_scraped_data/"

# Count the complex words of every scraped file.
complex_rows = []
for file in os.listdir(folder):
  file_path = os.path.join(folder,file)
  file_part = os.path.splitext(file)[0]
  with open(file_path, 'r', encoding='utf-8') as file_content:
    text = file_content.read()
  complex_words_count = count_complex_words(text)
  complex_rows.append({'file':file_part, 'complex word count': complex_words_count})
  print("Count of complex words:", complex_words_count)

# DataFrame.append was removed in pandas 2.0; build the frame once from the
# collected rows instead.
tempdf = pd.DataFrame(complex_rows, columns = ['file', 'complex word count'])

tempdf.head(30)

# Sort by the number in the file name and renumber the index: column
# assignment between frames aligns on index labels, so a stale index would
# pair the counts with the wrong rows.
tempdf = tempdf.sort_values(by='file', key=lambda x: x.map(natural_sort_key)).reset_index(drop = True)

tempdf.head(50)

df.head()

score_dataframe.head(80)

# Attach the complex-word counts by file name. A plain column assignment
# aligns on index labels, which do not identify the same file in both frames
# unless both were sorted and re-indexed identically.
complex_counts_by_file = tempdf.drop_duplicates(subset = ['file']).set_index('file')['complex word count']
score_dataframe['complex word count'] = score_dataframe['file'].map(complex_counts_by_file)

score_dataframe.head()

score_dataframe['percentage of complex words'] = (score_dataframe['complex word count']/score_dataframe['total word count'])*100

score_dataframe.head()

# Fog index as computed here: 0.4 * (average sentence length + percentage
# of complex words).
score_dataframe['fog index'] = 0.4*(score_dataframe['average sentence lenght']+score_dataframe['percentage of complex words'])
score_dataframe.head()

# The loop below needs `file_names`, `count_syllables_in_text` and
# `count_personal_pronouns`, none of which is defined elsewhere in this
# file; they are provided here so the script can run.
# NOTE(review): these are reconstructions - confirm them against the
# definitions used in the original notebook.
file_names = os.listdir(folder_path)


def count_syllables_in_text(text):
    """Return a list with textstat's syllable count for each word of ``text``.

    Words are runs of ASCII letters, optionally with one inner apostrophe.
    """
    words = re.findall(r"[A-Za-z]+(?:'[A-Za-z]+)?", text)
    return [textstat.syllable_count(word) for word in words]


def count_personal_pronouns(text):
    """Return how many times I / we / my / ours / us occur in ``text``.

    Matching is case-insensitive, except that the all-caps token ``US`` is
    skipped.
    NOTE(review): both the pronoun list and the ``US`` exclusion are
    assumptions - confirm.
    """
    matches = re.finditer(r'\b(?:I|we|my|ours|us)\b', text, flags=re.IGNORECASE)
    return sum(1 for match in matches if match.group() != 'US')


# Processing each text file
syllable_rows = []
for file_name in file_names:
    # Creating the full file path
    file_path = os.path.join(folder_path, file_name)

    # Reading the content of the text file
    with open(file_path, 'r', encoding='utf-8') as file:
        text_content = file.read()

    # Counting syllables in the text (one count per word)
    syllable_counts = count_syllables_in_text(text_content)

    # Counting personal pronouns in the text
    pronoun_count = count_personal_pronouns(text_content)

    syllable_rows.append({'File_Name': file_name, 'Syllable_Count': sum(syllable_counts), 'Personal_Pronoun_Count': pronoun_count})

# DataFrame.append was removed in pandas 2.0; build the frame once from the
# collected rows instead.
score_df = pd.DataFrame(syllable_rows, columns=['File_Name', 'Syllable_Count', 'Personal_Pronoun_Count'])

score_df = score_df.sort_values(by='File_Name', key=lambda x: x.map(natural_sort_key)).reset_index(drop=True)

score_df.head()

# Attach the results by file name rather than by index label, so each count
# lands on the row of the file it was computed from. score_dataframe stores
# names without extension, so the extension is stripped before matching.
per_file = (
    score_df
    .assign(file=score_df['File_Name'].map(lambda name: os.path.splitext(name)[0]))
    .drop_duplicates(subset=['file'])
    .set_index('file')
)
score_dataframe['syllable count'] = score_dataframe['file'].map(per_file['Syllable_Count'])
score_dataframe['personal pronoun count'] = score_dataframe['file'].map(per_file['Personal_Pronoun_Count'])
score_dataframe.head()

print(score_dataframe.columns)

print(score_dataframe.head())

# Copy the computed metrics into the output frame (aligned on index labels).
df['POSITIVE SCORE'] = score_dataframe['positive score']
df['NEGATIVE SCORE'] = score_dataframe['negative score']
df['POLARITY SCORE'] = score_dataframe['polarity score']
df['AVG SENTENCE LENGTH'] = score_dataframe['average sentence lenght']
df['SUBJECTIVITY SCORE'] = score_dataframe['subjectivity score']
df['FOG INDEX'] = score_dataframe['fog index']
# Syllables per word is a ratio: the file's total syllable count divided by
# its word count, not the raw syllable total.
df['SYLLABLE PER WORD'] = score_dataframe['syllable count'] / score_dataframe['total word count']
df['AVG NUMER OF WORDS PER SENTENCE'] = score_dataframe['average sentence lenght']
df['PERCENTAGE OF COMPLEX WORDS'] = score_dataframe['percentage of complex words']
df['COMPLEX WORD COUNT'] = score_dataframe['complex word count']
df['WORD COUNT'] = score_dataframe['total word count']
df['PERSONAL PRONOUNS'] = score_dataframe['personal pronoun count']

df.head()

# Keep only the first occurrence of any repeated column label.
df = df.loc[:, ~df.columns.duplicated()]

# Fix the misspelled header. Rebinding the result avoids an in-place rename
# on a frame that was just produced by slicing.
df = df.rename(columns = {'AVG NUMER OF WORDS PER SENTENCE':'AVG NUMBER OF WORDS PER SENTENCE'})

# head is a method; without the call only the bound method object is shown.
df.head()

def character_count(text):
  """Return how many ASCII letters and digits ``text`` contains."""
  # Subtract every character that is not an ASCII letter or digit from the
  # total length.
  excluded = re.findall(r'[^a-zA-Z0-9]', text)
  return len(text) - len(excluded)
# Count the ASCII letters and digits of every scraped file.
character_rows = []
for file in os.listdir(folder):
  file_path = os.path.join(folder, file)
  file_part = os.path.splitext(file)[0]
  # A dedicated handle name avoids rebinding `raw`, which holds the input
  # sheet; UTF-8 matches the encoding the scraped files are written with.
  with open(file_path,'r', encoding='utf-8') as text_file:
    text = text_file.read()
  characters = character_count(text)
  character_rows.append({'file': file_part, 'character count': characters})

# DataFrame.append was removed in pandas 2.0; build the frame once from the
# collected rows instead.
temp2df = pd.DataFrame(character_rows, columns = ['file','character count'])

# Sort by the number in the file name and renumber the index.
temp2df = temp2df.sort_values(by='file', key=lambda x: x.map(natural_sort_key)).reset_index(drop = True)
temp2df.head()

# Attach the counts by file name rather than by index label, so each count
# lands on the row of the file it was computed from.
characters_by_file = temp2df.drop_duplicates(subset = ['file']).set_index('file')['character count']
score_dataframe['character count'] = score_dataframe['file'].map(characters_by_file)

score_dataframe.head()

print(score_dataframe.head())

# Average word length in characters, rounded down to a whole number.
score_dataframe['average word length'] = (score_dataframe['character count']/score_dataframe['total word count']).apply(math.floor)

score_dataframe.head()

df['AVG WORD LENGHT'] = score_dataframe['average word length']

df.head()

# NOTE(review): this file is written before the header typo is corrected,
# so it keeps the 'AVG WORD LENGHT' column name - confirm that is intended.
df.to_csv('/content/Output_reordered.csv', index=False)

# rename() applies a bare mapping to the index labels; the columns= keyword
# is required for the header to actually change.
df = df.rename(columns = {'AVG WORD LENGHT':'AVG WORD LENGTH'})

df.head()

df.to_csv('output_file.csv', index=False)