-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathextract.py
More file actions
31 lines (22 loc) · 1 KB
/
extract.py
File metadata and controls
31 lines (22 loc) · 1 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
def preprocess_document(file_path, encoding='latin1', *,
                        columns=('Digraphs 1', 'Digraphs 2', 'Digraphs 3'),
                        maxlen=100, verbose=True):
    """Read a CSV file and convert selected text columns to padded sequences.

    For every CSV row, the cells of ``columns`` are joined with single
    spaces into one text. A Keras ``Tokenizer`` (default settings) is fitted
    on those texts, the texts are converted to integer sequences, and the
    sequences are passed through ``pad_sequences`` with ``maxlen``.

    Args:
        file_path: Path (or anything ``pandas.read_csv`` accepts) of the CSV.
        encoding: Text encoding handed to ``pandas.read_csv``.
        columns: Name, or iterable of names, of the columns whose cells form
            the text of each row.
        maxlen: Sequence length passed to ``pad_sequences``.
        verbose: When true, print the CSV column names and the shape of the
            padded result (the function's original diagnostic output).

    Returns:
        A ``(data, word_index)`` tuple: ``data`` is the array returned by
        ``pad_sequences`` (one row per CSV row), and ``word_index`` is the
        fitted tokenizer's ``word_index`` mapping.

    Raises:
        KeyError: If any requested column is absent from the CSV.
    """
    df = pd.read_csv(file_path, encoding=encoding)

    if verbose:
        # Print the column names for debugging
        print("Columns in the CSV file:", df.columns)

    # A bare string would otherwise be iterated character by character.
    if isinstance(columns, str):
        columns = [columns]
    columns = list(columns)

    missing = [name for name in columns if name not in df.columns]
    if missing:
        raise KeyError(f"Columns missing from {file_path!r}: {missing}")

    # Missing cells become empty strings; astype(str) then guarantees that
    # every remaining cell is a str, so ' '.join cannot fail on numeric data.
    texts = df[columns].fillna('').astype(str).values
    combined_texts = [' '.join(row) for row in texts]

    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(combined_texts)
    sequences = tokenizer.texts_to_sequences(combined_texts)
    word_index = tokenizer.word_index

    data = pad_sequences(sequences, maxlen=maxlen)

    if verbose:
        # Print the shape of the data
        print(f'Shape of padded sequences: {data.shape}')

    return data, word_index
# Example usage.
# NOTE: these statements sit at module top level, so they execute (reading
# 'Digraphs.csv' from the current working directory and printing the result)
# whenever this file is run or imported.
file_path = 'Digraphs.csv'
# X holds the padded sequences; word_index is the tokenizer's vocabulary map.
X, word_index = preprocess_document(file_path)
print(f'Padded sequences:\n{X}')