upgrade to latest

rohitc5 · rohitc5 · commit eed4a29c45f2 · 2023-06-11T20:57:44.000+05:30
diff --git a/api/requirements.txt b/api/requirements.txt
@@ -15,4 +15,6 @@ optimum[neural-compressor]==1.8.6
 neural-compressor==2.1.1
 optimum-intel==1.8.1
 nltk==3.8.1
+scikit-learn==1.2.2
+scikit-learn-intelex==2023.1.1
 webvtt-py==0.4.6
diff --git a/assets/EDA-Worldcloud.png b/assets/EDA-Worldcloud.png
diff --git a/nlp/README.md b/nlp/README.md
@@ -0,0 +1,22 @@
+
+# Preprocess course dataset (to enforce a minimum text length for each subtitle)
+
+- Run the `preprocess_subtitle.py` script
+
+```bash
+   $ cd utils
+
+   $ python preprocess_subtitle.py --course_dir=../../dataset/courses --min_text_len=500
+```
+
+# Perform Basic EDA on course dataset
+
+- Run the `run_eda.py` script
+
+```bash
+ $ cd utils
+
+ $ python run_eda.py --course_dir=../../dataset/courses
+```
+
+![EDA](../assets/EDA-Worldcloud.png)
diff --git a/nlp/utils/preprocess_subtitle.py b/nlp/utils/preprocess_subtitle.py
@@ -76,24 +76,15 @@ def main(args):
         "--course_dir",
         type=str,
         help="base directory containing courses",
-        default="dataset/courses"
+        default="../../dataset/courses"
     )
     parser.add_argument(
         "--min_text_len",
         type=int,
         default=500,
         help="Minimum length of each subtitle text (in chars)"
     )
-    parser.add_argument(
-        "--output_dir",
-        type=str,
-        default=None,
-        help="Output dir where preprocessed subtitles will be saved"
-    )
-
+    
     args = parser.parse_args()
 
-    if args.output_dir is not None:
-        os.makedirs(args.output_dir, exist_ok=True)
-
     main(args)
diff --git a/nlp/utils/run_eda.py b/nlp/utils/run_eda.py
@@ -1,7 +1,26 @@
+
+#!/usr/bin/env python
+# coding=utf-8
+# Copyright 2023 C5ailabs Team All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Perform Basic EDA for course content.
+"""
 import argparse
 import os
 import webvtt
-import modein.pandas as pd
+import modin.pandas as pd
 import matplotlib.pyplot as plt
 from wordcloud import WordCloud
 from sklearn.feature_extraction.text import TfidfVectorizer
@@ -47,23 +66,27 @@ def generate_wordcloud(phrase_counts):
     return plt
 
 def main(args):
-
+    
     path = os.path.join(args.course_dir, "*/Study-Material/*/*/*.vtt")
     subtitle_fpaths = glob(path)
 
+    from distributed import Client
+    client = Client()
+
     df = read_subtitle_vtt(subtitle_fpaths)
 
     phrase_counts, _, _ = extract_top_phrases_tfidf(df, 'caption_text')
     wordcloud = generate_wordcloud(phrase_counts)
-    plt.show()
+    plt.savefig('EDA-Worldcloud.png', bbox_inches='tight')
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description='Perform Basic EDA on given dataset')
+
     parser.add_argument(
         "--course_dir",
         type=str,
         help="base directory containing courses",
-        default="dataset/courses"
+        default="../../dataset/courses"
     )
     
     args = parser.parse_args()