Skip to content

Commit eed4a29

Browse files
committed
upgrade to latest
1 parent 0aeb76c commit eed4a29

File tree

5 files changed

+53
-15
lines changed

5 files changed

+53
-15
lines changed

api/requirements.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,4 +15,6 @@ optimum[neural-compressor]==1.8.6
1515
neural-compressor==2.1.1
1616
optimum-intel==1.8.1
1717
nltk==3.8.1
18+
scikit-learn==1.2.2
19+
scikit-learn-intelex==2023.1.1
1820
webvtt-py==0.4.6

assets/EDA-Worldcloud.png

225 KB
Loading

nlp/README.md

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
2+
# Preprocess the course dataset (to enforce a minimum text length for each subtitle)
3+
4+
- Run the `preprocess_subtitle.py` script
5+
6+
```bash
7+
$ cd utils
8+
9+
$ python preprocess_subtitle.py --course_dir=../../dataset/courses --min_text_len=500
10+
```
11+
12+
# Perform Basic EDA on course dataset
13+
14+
- Run the `run_eda.py` script
15+
16+
```bash
17+
$ cd utils
18+
19+
$ python run_eda.py --course_dir=../../dataset/courses
20+
```
21+
22+
![EDA](../assets/EDA-Worldcloud.png)

nlp/utils/preprocess_subtitle.py

Lines changed: 2 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -76,24 +76,15 @@ def main(args):
7676
"--course_dir",
7777
type=str,
7878
help="base directory containing courses",
79-
default="dataset/courses"
79+
default="../../dataset/courses"
8080
)
8181
parser.add_argument(
8282
"--min_text_len",
8383
type=int,
8484
default=500,
8585
help="Minimum length of each subtitle text (in chars)"
8686
)
87-
parser.add_argument(
88-
"--output_dir",
89-
type=str,
90-
default=None,
91-
help="Output dir where preprocessed subtitles will be saved"
92-
)
93-
87+
9488
args = parser.parse_args()
9589

96-
if args.output_dir is not None:
97-
os.makedirs(args.output_dir, exist_ok=True)
98-
9990
main(args)

nlp/utils/run_eda.py

Lines changed: 27 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,26 @@
1+
2+
#!/usr/bin/env python
3+
# coding=utf-8
4+
# Copyright 2023 C5ailabs Team All rights reserved.
5+
#
6+
# Licensed under the Apache License, Version 2.0 (the "License");
7+
# you may not use this file except in compliance with the License.
8+
# You may obtain a copy of the License at
9+
#
10+
# http://www.apache.org/licenses/LICENSE-2.0
11+
#
12+
# Unless required by applicable law or agreed to in writing, software
13+
# distributed under the License is distributed on an "AS IS" BASIS,
14+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
# See the License for the specific language governing permissions and
16+
# limitations under the License.
17+
"""
18+
Perform Basic EDA for course content.
19+
"""
120
import argparse
221
import os
322
import webvtt
4-
import modein.pandas as pd
23+
import modin.pandas as pd
524
import matplotlib.pyplot as plt
625
from wordcloud import WordCloud
726
from sklearn.feature_extraction.text import TfidfVectorizer
@@ -47,23 +66,27 @@ def generate_wordcloud(phrase_counts):
4766
return plt
4867

4968
def main(args):
50-
69+
5170
path = os.path.join(args.course_dir, "*/Study-Material/*/*/*.vtt")
5271
subtitle_fpaths = glob(path)
5372

73+
from distributed import Client
74+
client = Client()
75+
5476
df = read_subtitle_vtt(subtitle_fpaths)
5577

5678
phrase_counts, _, _ = extract_top_phrases_tfidf(df, 'caption_text')
5779
wordcloud = generate_wordcloud(phrase_counts)
58-
plt.show()
80+
plt.savefig('EDA-Worldcloud.png', bbox_inches='tight')
5981

6082
if __name__ == "__main__":
6183
parser = argparse.ArgumentParser(description='Perform Basic EDA on given dataset')
84+
6285
parser.add_argument(
6386
"--course_dir",
6487
type=str,
6588
help="base directory containing courses",
66-
default="dataset/courses"
89+
default="../../dataset/courses"
6790
)
6891

6992
args = parser.parse_args()

0 commit comments

Comments
 (0)