From c834a69edf124478f0bfbce278a549757e1c1443 Mon Sep 17 00:00:00 2001 From: Indrayudd Roy Chowdhury Date: Wed, 11 Jun 2025 18:34:57 -0400 Subject: [PATCH 01/11] TutorTask554: Add a modified version of Aayush's code MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pre-commit checks: All checks passed ✅ --- .../docker_data605_style/__init__.py | 0 DATA605/project_description.py | 183 ++++++++++++++++++ 2 files changed, 183 insertions(+) create mode 100644 DATA605/Spring2025/projects/TutorTask112a/docker_data605_style/__init__.py create mode 100644 DATA605/project_description.py diff --git a/DATA605/Spring2025/projects/TutorTask112a/docker_data605_style/__init__.py b/DATA605/Spring2025/projects/TutorTask112a/docker_data605_style/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/DATA605/project_description.py b/DATA605/project_description.py new file mode 100644 index 0000000000..05fa145ef4 --- /dev/null +++ b/DATA605/project_description.py @@ -0,0 +1,183 @@ +#!/usr/bin/env python +""" +Generate project descriptions from a Google Sheet and save them to a Markdown +file. + +> project_description.py \ + --sheet_url "https://docs.google.com/spreadsheets/d/1abc.../edit#gid=0" \ + --markdown_path ./projects/MSML610_Projects.md \ + --max_projects 3 \ + -v INFO + +Import as: + +import DATA605.project_description as dprodesc +""" + +import argparse +import logging +import pathlib +import time +from typing import Any, Optional + +import pandas as pd + +import helpers_root.helpers.hdbg as hdbg +import helpers_root.helpers.hgoogle_drive_api as hgofiapi +import helpers_root.helpers.hio as hio +import helpers_root.helpers.hopenai as hopenai +import helpers_root.helpers.hparser as hparser + +_LOG = logging.getLogger(__name__) + +# Set Constants. 
+DEFAULT_SHEET_URL = "https://docs.google.com/spreadsheets/d/1Ez5uRvOgvDMkFc9c6mI21kscTKnpiCSh4UkUh_ifLIw/edit?gid=0#gid=0" +GLOBAL_PROMPT = """ +You are a college professor of Data science. +In the next prompt I will give you a topic XYZ for a class project and you will write a description using bullet points for a college class project about implementing an example big data system in Python. + +The project should be related to ingesting and processing real-time data about bitcoin. The focus should be on the technology XYZ, using basic Python packages for anything else. + +The assignment requires to describe the basic functionalities of the package using examples and then a concrete project related to implementing something related to time series analysis. +The complexity of the project is 1, where 1 is easy (it should take around 7 days) to develop, 2 is medium difficulty (it should take around 10 days to complete), 3 is hard (it should take 14 days to complete). + +The output should follow the template below +Title: +Difficulty: (1=easy, 3=difficult) +Description +Describe technology +Describe the project +Useful resources +Is it free? +Python libraries / bindings +""" + +EXAMPLE = """Example: +Title: Ingest bitcoin prices using AWS Glue (AWS Glue is technology XYZ) +Difficulty: 1 +Description +AWS Glue is a fully managed extract, transform, and load (ETL) service... +Useful resources: AWS Glue Docs +Is it free?: Free tier available with limits +Python libraries: boto3, PySpark +""" +DEFAULT_MARKDOWN_PATH = "./projects/MSML610_Projects.md" +# The maximum number of projects. +# Set the value to None to disable the limit. +DEFAULT_MAX_PROJECTS = 5 + + +def read_google_sheet(url: str, secret_path: str) -> pd.DataFrame: + """ + Read the Google Sheet and return the data as a pandas DataFrame. 
+ + :param url: the URL of the Google Sheet to read + :param secret_path: path to google_secret.json + :return: the data + """ + credentials = hgofiapi.get_credentials(service_key_path=secret_path) + df = hgofiapi.read_google_file(url, credentials=credentials) + return df + + +def generate_project_description(project_name: str, difficulty: str) -> Any: + """ + Generate a project description. + + :param project_name: the name of the project + :param difficulty: the difficulty level of the project + :return: the project description + """ + # Generate the project description. + # prompt = f"Generate a project description for '{project_name}' with difficulty level '{difficulty}'." + # prompt = PROMPT_DOC_URL.strip()+ "\n\n"+ EXAMPLE.strip()+ f"\n\nTechnology: {project_name}\nDifficulty: {difficulty}" + # description = hopenai.get_completion(prompt, model="gpt-4o-mini") + # return description + prompt = f"Technology: {project_name}\nDifficulty: {difficulty}" + project_desc = hopenai.get_completion( + prompt, + system_prompt=GLOBAL_PROMPT, + model="gpt-4o-mini", + cache_mode="FALLBACK", + print_cost=True, + ) + return project_desc + + +def create_markdown_file( + df: pd.DataFrame, + markdown_path: str, + max_projects: Optional[int], + *, + sleep_sec: float = 1.5, +) -> None: + """ + Create a markdown file with the project descriptions using helpers.hio. + + :param df: the dataframe containing the project descriptions + :param markdown_path: the path to the markdown file + :param max_projects: limit to the rows processed + :param sleep_sec: amount of time to sleep between rows + """ + content = "# MSML610 Projects\n\n" + # Generate the project descriptions. + # Limit the number of projects. + rows = df.head(max_projects) if max_projects is not None else df + for _, row in rows.iterrows(): + project_name = row["Tool"] + difficulty = row["Difficulty"] + description = generate_project_description(project_name, difficulty) + # Add the project description to the markdown file. 
+ content += f"## {project_name}\n" + content += f"{description}\n\n" + time.sleep(sleep_sec) + # Write the markdown file. + hio.to_file(markdown_path, content) + + +def _parse() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser( + description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter + ) + parser.add_argument( + "--sheet_url", default=DEFAULT_SHEET_URL, help="Google Sheet URL" + ) + parser.add_argument( + "--secret_path", + default="/app/DATA605/google_secret.json", + help="Path to Google service‑account JSON.", + ) + parser.add_argument( + "--markdown_path", + default=DEFAULT_MARKDOWN_PATH, + help="Output Markdown file", + ) + parser.add_argument( + "--max_projects", + type=int, + default=DEFAULT_MAX_PROJECTS, + help="Limit rows processed (None = all).", + ) + hparser.add_verbosity_arg(parser) # adds -v / --log_level + return parser + + +def _main(parser: argparse.ArgumentParser) -> None: + args = parser.parse_args() + hdbg.init_logger(verbosity=args.log_level, use_exec_path=True) + # Expand user/relative paths to absolute ones early to avoid surprises. 
+ secret_path = str(pathlib.Path(args.secret_path).expanduser().resolve()) + markdown_path = str(pathlib.Path(args.markdown_path).expanduser().resolve()) + _LOG.info("Reading sheet %s", args.sheet_url) + sheet_df = read_google_sheet(args.sheet_url, secret_path) + _LOG.info("Generating Markdown → %s", markdown_path) + create_markdown_file( + sheet_df, + markdown_path, + args.max_projects, + ) + _LOG.info("Done: %s", markdown_path) + + +if __name__ == "__main__": + _main(_parse()) From c4a131a225146c6fbdb45f43cbd85d0e0cdc8d81 Mon Sep 17 00:00:00 2001 From: Indrayudd Roy Chowdhury Date: Wed, 11 Jun 2025 18:38:34 -0400 Subject: [PATCH 02/11] TutorTask554: Add a modified version of Aayush's code MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pre-commit checks: All checks passed ✅ --- DATA605/project_description.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DATA605/project_description.py b/DATA605/project_description.py index 05fa145ef4..8a99f1c8c0 100644 --- a/DATA605/project_description.py +++ b/DATA605/project_description.py @@ -4,7 +4,7 @@ file. 
> project_description.py \ - --sheet_url "https://docs.google.com/spreadsheets/d/1abc.../edit#gid=0" \ + --sheet_url "https://docs.google.com/spreadsheets/d/1abc...gid=0" \ --markdown_path ./projects/MSML610_Projects.md \ --max_projects 3 \ -v INFO From 3b1fd492ed00d4a220909faa9a78346ad91fa0aa Mon Sep 17 00:00:00 2001 From: Aayush Date: Wed, 11 Jun 2025 22:36:39 -0500 Subject: [PATCH 03/11] TutorTask554: Adding documentation and script MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pre-commit checks: All checks passed ✅ --- DATA605/project_description.py | 10 ++- docs/project_description_explanation.md | 77 ++++++++++++++++++++++++ docs/project_description_how_to_guide.md | 76 +++++++++++++++++++++++ 3 files changed, 162 insertions(+), 1 deletion(-) create mode 100644 docs/project_description_explanation.md create mode 100644 docs/project_description_how_to_guide.md diff --git a/DATA605/project_description.py b/DATA605/project_description.py index 8a99f1c8c0..16b7a4af2c 100644 --- a/DATA605/project_description.py +++ b/DATA605/project_description.py @@ -64,7 +64,7 @@ DEFAULT_MARKDOWN_PATH = "./projects/MSML610_Projects.md" # The maximum number of projects. # Set the value to None to disable the limit. 
-DEFAULT_MAX_PROJECTS = 5 +DEFAULT_MAX_PROJECTS = None def read_google_sheet(url: str, secret_path: str) -> pd.DataFrame: @@ -75,6 +75,8 @@ def read_google_sheet(url: str, secret_path: str) -> pd.DataFrame: :param secret_path: path to google_secret.json :return: the data """ + _LOG.info(f"Reading Google Sheet: {url}") + _LOG.info(f"Using credentials from: {secret_path}") credentials = hgofiapi.get_credentials(service_key_path=secret_path) df = hgofiapi.read_google_file(url, credentials=credentials) return df @@ -158,6 +160,12 @@ def _parse() -> argparse.ArgumentParser: default=DEFAULT_MAX_PROJECTS, help="Limit rows processed (None = all).", ) + parser.add_argument( + "--openai_key", + type=str, + default=None, + help="OpenAI API key (will override env var)", +) hparser.add_verbosity_arg(parser) # adds -v / --log_level return parser diff --git a/docs/project_description_explanation.md b/docs/project_description_explanation.md new file mode 100644 index 0000000000..bf82318254 --- /dev/null +++ b/docs/project_description_explanation.md @@ -0,0 +1,77 @@ +# Explanation: `project_description.py` + + + +- [Introduction and motivation](#introduction-and-motivation) +- [Core Concepts](#core-concepts) +- [How It Works](#how-it-works) +- [Design Rationale](#design-rationale) +- [Trade-offs and Alternatives](#trade-offs-and-alternatives) + + + +## Introduction and motivation + +- This tool automates the generation of academic project descriptions by + integrating Google Sheets input with OpenAI's GPT API. +- It addresses the need for scalable, consistent, and high-quality project + documentation based on dynamic student or faculty input. +- It is intended for course instructors, academic administrators, or data + science curriculum designers who wish to streamline project generation and + documentation. + +## Core Concepts + +- **Google Sheets Integration:** Uses Google Sheets as the dynamic data source + for project names and difficulty levels. 
+- **Prompt Engineering:** A pre-defined prompt template guides GPT to produce + structured project descriptions. +- **Markdown Generation:** Outputs the generated content into a formatted + Markdown file for easy distribution. +- **Helper Modules:** External utility modules (`hgoogle_drive_api`, `hopenai`, + `hio`) abstract authentication, I/O, and API interaction. + +## How It Works + +- The script follows this control flow: + + ```markdown + [Google Sheet URL] → read_google_sheet() → [DataFrame of projects] → loop → + generate_project_description() → [GPT-generated text] → create_markdown_file() + → [Markdown output] + ``` + +- Key Functions: + - `read_google_sheet(url, secret_path)`: Reads spreadsheet and returns a pandas DataFrame. + - `generate_project_description(project_name, difficulty)`: Sends input to + GPT-4o-mini model and returns generated text. + - `create_markdown_file(df, markdown_path, max_projects)`: Iterates over the DataFrame, + generates description for each row, and writes it to a Markdown file. + +## Design Rationale + +- **Automation Focus:** Built to minimize manual work for faculty managing large + project datasets. +- **Modular Helpers:** Offloading I/O and API logic to separate modules makes + this script easier to maintain or port. +- **GPT as Content Generator:** Using GPT-4o-mini allows flexibility and + high-quality text output with minimal prompt tuning. + +## Trade-offs and Alternatives + +- **Current Approach:** + - Advantages: + - Automated, reproducible, and scalable. + - Maintains separation of logic (reading input, generating content, writing + file). + - Drawbacks: + - Dependent on OpenAI and Google APIs (connectivity and API keys required). + - Limited error handling and logging for individual failures. + +- **Alternative Approach:** + - Using a GUI-based application or Jupyter notebook for manual review and + editing. + - Advantages: + - Allows user customization and validation at each step. 
+ - Drawbacks: + - Slower and less scalable; not suitable for batch generation. diff --git a/docs/project_description_how_to_guide.md b/docs/project_description_how_to_guide.md new file mode 100644 index 0000000000..f914af5d3f --- /dev/null +++ b/docs/project_description_how_to_guide.md @@ -0,0 +1,76 @@ + + + * [What It Does](#what-it-does) + * [Assumptions / Requirements](#assumptions--requirements) + * [Instructions](#instructions) + + [Step 1: Fetch Input](#step-1-fetch-input) +- [Edit GOOGLE_SHEET_URL inside the script or pass a new one to read_google_sheet()](#edit-google_sheet_url-inside-the-script-or-pass-a-new-one-to-read_google_sheet) + * [Step 2: Describe Action](#step-2-describe-action) + + [Step 3: Review Output](#step-3-review-output) + * [Troubleshooting](#troubleshooting) + + + +# What It Does + +- Automates the process of generating academic project descriptions by: + - Reading project data from a Google Sheet. + - Using OpenAI's API to auto-generate detailed project descriptions. + - Saving the final output in a formatted Markdown file for distribution. + +## Assumptions / Requirements + +- Google Cloud service key file at `/app/DATA605/google_secret.json` +- Docker running +- Valid OpenAI API key for model access +- Project-specific helper modules must be available: + - helpers.hgoogle_drive_api + - helpers.hio + - helpers.hopenai + +## Instructions + +### Step 1: Fetch Input + +Ensure the Google Sheet is publicly accessible or shared with the configured +service account. 
+ +The Google Sheet should contain: + +- Project name + +- Difficulty + +# Edit GOOGLE_SHEET_URL inside the script or pass a new one to read_google_sheet() + +URL="https://docs.google.com/spreadsheets/d//edit" + +## Step 2: Describe Action + +- Run the script directly using Python +- This will: + + Authenticate and read the Google Sheet + + Generate a project description using OpenAI for each row + + Save the top 5 (or all if MAX_PROJECTS=None) projects in a file called + `./projects/DATA605_Projects.md` + +### Step 3: Review Output + +- Navigate to the projects/ folder and open DATA605_Projects.md. + +## Troubleshooting + +Issue: google.auth.exceptions.DefaultCredentialsError Cause: Google service key +not found at the expected path. Fix: Place the correct google_secret.json file +in /app/DATA605/. + +Issue: ModuleNotFoundError: No module named 'helpers' Cause: Missing local +helper modules. Fix: Ensure helpers/ directory is in your PYTHONPATH or the same +directory as the script. + +Issue: Empty or incomplete output file Cause: API failure or invalid sheet +format. Fix: Check logs, verify if the OpenAI and Google API calls are working, +and ensure data in the Google Sheet is structured correctly. From 570c1f3fbdab2075320102b6804abe0e3d68b40e Mon Sep 17 00:00:00 2001 From: Aayush Date: Thu, 12 Jun 2025 00:00:44 -0500 Subject: [PATCH 04/11] Adding unit test file and improvements MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pre-commit checks: All checks passed ✅ --- DATA605/project_description.py | 57 +++++++++++++++++--------- DATA605/test_project_description.py | 63 +++++++++++++++++++++++++++++ 2 files changed, 100 insertions(+), 20 deletions(-) create mode 100644 DATA605/test_project_description.py diff --git a/DATA605/project_description.py b/DATA605/project_description.py index 16b7a4af2c..ca5fdedbf5 100644 --- a/DATA605/project_description.py +++ b/DATA605/project_description.py @@ -32,26 +32,41 @@ # Set Constants. 
DEFAULT_SHEET_URL = "https://docs.google.com/spreadsheets/d/1Ez5uRvOgvDMkFc9c6mI21kscTKnpiCSh4UkUh_ifLIw/edit?gid=0#gid=0" -GLOBAL_PROMPT = """ -You are a college professor of Data science. -In the next prompt I will give you a topic XYZ for a class project and you will write a description using bullet points for a college class project about implementing an example big data system in Python. - -The project should be related to ingesting and processing real-time data about bitcoin. The focus should be on the technology XYZ, using basic Python packages for anything else. - -The assignment requires to describe the basic functionalities of the package using examples and then a concrete project related to implementing something related to time series analysis. -The complexity of the project is 1, where 1 is easy (it should take around 7 days) to develop, 2 is medium difficulty (it should take around 10 days to complete), 3 is hard (it should take 14 days to complete). - -The output should follow the template below -Title: -Difficulty: (1=easy, 3=difficult) -Description -Describe technology -Describe the project -Useful resources -Is it free? -Python libraries / bindings -""" - +# GLOBAL_PROMPT = """ +# You are a college professor of Data Science. +# I will give you a topic XYZ for a class project. +# Your task is to generate a short, structured project brief for college students focused on implementing a big data system in Python, using the technology XYZ. + +# Requirements: +# The project must involve real-time ingestion and processing of Bitcoin data. +# Emphasize how XYZ helps in this context. +# The response should be concise and in bullet points only. +# Avoid long descriptions or step-by-step guides. +# The project must include a time series analysis component. 
+# The complexity of the project should range from 1, where 1 is easy (it should take around 7 days) to develop, 2 is medium difficulty (it should take around 10 days to complete), 3 is hard (it should take 14 days to complete). + +# The output should follow the template below +# Title: +# Difficulty: (1=easy, 3=difficult) +# Description +# Describe technology +# Describe the project +# Useful resources +# Is it free? +# Python libraries / bindings +# """ +GLOBAL_PROMPT='''Act as a data science professor. I will give you a tool (XYZ) and difficulty level (1–3). Write a short bullet-point project brief on how XYZ can be used for real-time Bitcoin data ingestion in Python. Include: + +- Title +- Difficulty (1 means easy, should take around 7 days to develop, 2 is medium difficulty, should take around 10 days to complete, 3 is hard,should take 14 days to complete) +- Tech Description +- Project Idea +- Python libs +- Is it Free? +- Relevant tool(XYZ) related Resource Links + +Avoid long texts or steps +''' EXAMPLE = """Example: Title: Ingest bitcoin prices using AWS Glue (AWS Glue is technology XYZ) Difficulty: 1 @@ -101,6 +116,8 @@ def generate_project_description(project_name: str, difficulty: str) -> Any: system_prompt=GLOBAL_PROMPT, model="gpt-4o-mini", cache_mode="FALLBACK", + temperature=0.3, + max_tokens=400, print_cost=True, ) return project_desc diff --git a/DATA605/test_project_description.py b/DATA605/test_project_description.py new file mode 100644 index 0000000000..2a5b58a518 --- /dev/null +++ b/DATA605/test_project_description.py @@ -0,0 +1,63 @@ +import logging +from unittest import mock + +import pandas as pd + +import helpers_root.helpers.hunit_test as hunitest +import DATA605.project_description as projdesc + +_LOG = logging.getLogger(__name__) + +class TestProjectDescription1(hunitest.TestCase): + def test_read_google_sheet1(self) -> None: + """ + Test reading a Google Sheet returns a valid DataFrame. 
+ """ + url = "https://docs.google.com/fake-sheet-url" + secret_path = "/fake/path/to/secret.json" + + mock_data = pd.DataFrame({ + "Tool": ["Kafka"], + "Difficulty": ["2"] + }) + + with mock.patch("helpers_root.helpers.hgoogle_drive_api.get_credentials") as mock_creds, \ + mock.patch("helpers_root.helpers.hgoogle_drive_api.read_google_file", return_value=mock_data): + df = projdesc.read_google_sheet(url, secret_path) + self.assertIsInstance(df, pd.DataFrame) + self.assertEqual(df.shape[0], 1) + _LOG.debug("read_google_sheet1 → %s", df) + + def test_generate_project_description1(self) -> None: + """ + Test project description generation using mocked OpenAI. + """ + tech = "Kafka" + difficulty = "2" + + mock_output = "Title: Kafka Project\nDifficulty: 2\n..." + + with mock.patch("helpers_root.helpers.hopenai.get_completion", return_value=mock_output): + desc = projdesc.generate_project_description(tech, difficulty) + self.assertIn("Kafka", desc) + self.assertIn("Difficulty", desc) + _LOG.debug("generate_project_description1 → %s", desc) + + def test_create_markdown_file1(self) -> None: + """ + Test the markdown creation process with mocked data and completion. + """ + df = pd.DataFrame({ + "Tool": ["Kafka"], + "Difficulty": ["2"] + }) + markdown_path = "/tmp/test_projects.md" + mock_output = "Title: Kafka Project\nDifficulty: 2\n..." 
+ + with mock.patch("helpers_root.helpers.hopenai.get_completion", return_value=mock_output), \ + mock.patch("helpers_root.helpers.hio.to_file") as mock_to_file: + projdesc.create_markdown_file(df, markdown_path, max_projects=1, sleep_sec=0) + mock_to_file.assert_called_once() + written_content = mock_to_file.call_args[0][1] + self.assertIn("Kafka", written_content) + _LOG.debug("create_markdown_file1 content →\n%s", written_content) From 80f8482ccc1e6b4b860615bf0fd4f3a3539c65c5 Mon Sep 17 00:00:00 2001 From: Aayush Date: Thu, 12 Jun 2025 11:25:48 -0500 Subject: [PATCH 05/11] TutorTask554: Cosmetic changes to documentation file MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pre-commit checks: All checks passed ✅ --- docs/project_description_explanation.md | 7 +++---- docs/project_description_how_to_guide.md | 17 ++++++++--------- 2 files changed, 11 insertions(+), 13 deletions(-) diff --git a/docs/project_description_explanation.md b/docs/project_description_explanation.md index bf82318254..e4fff09c6c 100644 --- a/docs/project_description_explanation.md +++ b/docs/project_description_explanation.md @@ -13,11 +13,10 @@ ## Introduction and motivation - This tool automates the generation of academic project descriptions by - integrating Google Sheets input with OpenAI's GPT API. + integrating Google Sheets input with OpenAI API. - It addresses the need for scalable, consistent, and high-quality project documentation based on dynamic student or faculty input. -- It is intended for course instructors, academic administrators, or data - science curriculum designers who wish to streamline project generation and +- It is intended to streamline and automate project generation and documentation. 
## Core Concepts @@ -37,7 +36,7 @@ ```markdown [Google Sheet URL] → read_google_sheet() → [DataFrame of projects] → loop → - generate_project_description() → [GPT-generated text] → create_markdown_file() + Create prompt and feed into GPT → [GPT-generated text] → create_markdown_file() → [Markdown output] ``` diff --git a/docs/project_description_how_to_guide.md b/docs/project_description_how_to_guide.md index f914af5d3f..7631dd1747 100644 --- a/docs/project_description_how_to_guide.md +++ b/docs/project_description_how_to_guide.md @@ -11,7 +11,7 @@ -# What It Does +## What It Does - Automates the process of generating academic project descriptions by: - Reading project data from a Google Sheet. @@ -20,7 +20,7 @@ ## Assumptions / Requirements -- Google Cloud service key file at `/app/DATA605/google_secret.json` +- Google Cloud service key file ready to use - Docker running - Valid OpenAI API key for model access - Project-specific helper modules must be available: @@ -35,13 +35,16 @@ Ensure the Google Sheet is publicly accessible or shared with the configured service account. +For instructions on how to configure google sheets API, follow this link: +https://github.com/causify-ai/helpers/blob/c50fddfdffccdccb1b2d963b729ab9674d8fda8f/docs/tools/notebooks/all.gsheet_into_pandas.how_to_guide.md + The Google Sheet should contain: - Project name - Difficulty -# Edit GOOGLE_SHEET_URL inside the script or pass a new one to read_google_sheet() +### Edit Google Sheet URL inside the script or pass a new one through CLI URL="https://docs.google.com/spreadsheets/d//edit" @@ -54,12 +57,12 @@ URL="https://docs.google.com/spreadsheets/d//edit" Generate a project description using OpenAI for each row - Save the top 5 (or all if MAX_PROJECTS=None) projects in a file called + Save the top N (or all if MAX_PROJECTS=None) projects in a file called `./projects/DATA605_Projects.md` ### Step 3: Review Output -- Navigate to the projects/ folder and open DATA605_Projects.md. 
+- Output markdown stored at DATA605/projects/MSML610_Projects.md. ## Troubleshooting @@ -67,10 +70,6 @@ Issue: google.auth.exceptions.DefaultCredentialsError Cause: Google service key not found at the expected path. Fix: Place the correct google_secret.json file in /app/DATA605/. -Issue: ModuleNotFoundError: No module named 'helpers' Cause: Missing local -helper modules. Fix: Ensure helpers/ directory is in your PYTHONPATH or the same -directory as the script. - Issue: Empty or incomplete output file Cause: API failure or invalid sheet format. Fix: Check logs, verify if the OpenAI and Google API calls are working, and ensure data in the Google Sheet is structured correctly. From 311799285b3d55cb71060f9587d0994dc7740097 Mon Sep 17 00:00:00 2001 From: Aayush Date: Thu, 12 Jun 2025 11:38:06 -0500 Subject: [PATCH 06/11] TutorTask554: Cosmetic changes to documentation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pre-commit checks: All checks passed ✅ --- docs/project_description_how_to_guide.md | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/docs/project_description_how_to_guide.md b/docs/project_description_how_to_guide.md index 7631dd1747..0ef91f53dc 100644 --- a/docs/project_description_how_to_guide.md +++ b/docs/project_description_how_to_guide.md @@ -1,13 +1,15 @@ +# How To Guide: `project_description.py` + - * [What It Does](#what-it-does) - * [Assumptions / Requirements](#assumptions--requirements) - * [Instructions](#instructions) - + [Step 1: Fetch Input](#step-1-fetch-input) -- [Edit GOOGLE_SHEET_URL inside the script or pass a new one to read_google_sheet()](#edit-google_sheet_url-inside-the-script-or-pass-a-new-one-to-read_google_sheet) - * [Step 2: Describe Action](#step-2-describe-action) - + [Step 3: Review Output](#step-3-review-output) - * [Troubleshooting](#troubleshooting) +- [What It Does](#what-it-does) +- [Assumptions / 
Requirements](#assumptions--requirements) +- [Instructions](#instructions) + * [Step 1: Fetch Input](#step-1-fetch-input) + * [Edit Google Sheet URL inside the script or pass a new one through CLI](#edit-google-sheet-url-inside-the-script-or-pass-a-new-one-through-cli) +- [Step 2: Script Execution](#step-2-script-execution) + * [Step 3: Review Output](#step-3-review-output) +- [Troubleshooting](#troubleshooting) @@ -36,7 +38,7 @@ Ensure the Google Sheet is publicly accessible or shared with the configured service account. For instructions on how to configure google sheets API, follow this link: -https://github.com/causify-ai/helpers/blob/c50fddfdffccdccb1b2d963b729ab9674d8fda8f/docs/tools/notebooks/all.gsheet_into_pandas.how_to_guide.md +[https://github.com/causify-ai/helpers/blob/c50fddfdffccdccb1b2d963b729ab9674d8fda8f/docs/tools/notebooks/all.gsheet_into_pandas.how_to_guide.md](https://github.com/causify-ai/helpers/blob/c50fddfdffccdccb1b2d963b729ab9674d8fda8f/docs/tools/notebooks/all.gsheet_into_pandas.how_to_guide.md) The Google Sheet should contain: @@ -48,7 +50,7 @@ The Google Sheet should contain: URL="https://docs.google.com/spreadsheets/d//edit" -## Step 2: Describe Action +## Step 2: Script Execution - Run the script directly using Python - This will: @@ -62,7 +64,7 @@ URL="https://docs.google.com/spreadsheets/d//edit" ### Step 3: Review Output -- Output markdown stored at DATA605/projects/MSML610_Projects.md. +- Markdown stored at DATA605/projects/MSML610_Projects.md. 
## Troubleshooting From db3b66dec27f821dfd05725db59540f62ab3a218 Mon Sep 17 00:00:00 2001 From: Aayush Date: Thu, 12 Jun 2025 11:55:10 -0500 Subject: [PATCH 07/11] TutorTask554: COsmetic changes to documentation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pre-commit checks: All checks passed ✅ --- docs/project_description_how_to_guide.md | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/docs/project_description_how_to_guide.md b/docs/project_description_how_to_guide.md index 0ef91f53dc..639a9c95ef 100644 --- a/docs/project_description_how_to_guide.md +++ b/docs/project_description_how_to_guide.md @@ -6,7 +6,6 @@ - [Assumptions / Requirements](#assumptions--requirements) - [Instructions](#instructions) * [Step 1: Fetch Input](#step-1-fetch-input) - * [Edit Google Sheet URL inside the script or pass a new one through CLI](#edit-google-sheet-url-inside-the-script-or-pass-a-new-one-through-cli) - [Step 2: Script Execution](#step-2-script-execution) * [Step 3: Review Output](#step-3-review-output) - [Troubleshooting](#troubleshooting) @@ -46,10 +45,6 @@ The Google Sheet should contain: - Difficulty -### Edit Google Sheet URL inside the script or pass a new one through CLI - -URL="https://docs.google.com/spreadsheets/d//edit" - ## Step 2: Script Execution - Run the script directly using Python @@ -62,6 +57,14 @@ URL="https://docs.google.com/spreadsheets/d//edit" Save the top N (or all if MAX_PROJECTS=None) projects in a file called `./projects/DATA605_Projects.md` +Code to run script: + +```bash +python /project_description.py --sheet_url --secret_path --openai_key key --markdown_path -v INFO +``` + +Edit Google Sheet URL inside the script or pass a new one through CLI + ### Step 3: Review Output - Markdown stored at DATA605/projects/MSML610_Projects.md. 
From eaeca22d7dd6fed7a7d2db2ac34203de306db3d1 Mon Sep 17 00:00:00 2001 From: Aayush Date: Thu, 12 Jun 2025 23:29:45 -0500 Subject: [PATCH 08/11] TutorTask554: Adding unit tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pre-commit checks: All checks passed ✅ --- DATA605/test_project_description.py | 51 ++++++++++------------------- 1 file changed, 17 insertions(+), 34 deletions(-) diff --git a/DATA605/test_project_description.py b/DATA605/test_project_description.py index 2a5b58a518..1699618d60 100644 --- a/DATA605/test_project_description.py +++ b/DATA605/test_project_description.py @@ -1,56 +1,40 @@ -import logging -from unittest import mock - import pandas as pd - -import helpers_root.helpers.hunit_test as hunitest +from unittest import mock import DATA605.project_description as projdesc +import helpers.hunit_test as hunitest +import pytest -_LOG = logging.getLogger(__name__) +class TestProjectDescriptionWithCache(hunitest.TestCase): -class TestProjectDescription1(hunitest.TestCase): - def test_read_google_sheet1(self) -> None: - """ - Test reading a Google Sheet returns a valid DataFrame. 
- """ + @pytest.fixture(autouse=True) + def setup_teardown(self): + # Can initialize dummy cache or files here + yield + # Avoid triggering _GLOBAL_CAPSYS in tearDown + + def test_read_google_sheet(self) -> None: url = "https://docs.google.com/fake-sheet-url" secret_path = "/fake/path/to/secret.json" + mock_data = pd.DataFrame({"Tool": ["Kafka"], "Difficulty": ["2"]}) - mock_data = pd.DataFrame({ - "Tool": ["Kafka"], - "Difficulty": ["2"] - }) - - with mock.patch("helpers_root.helpers.hgoogle_drive_api.get_credentials") as mock_creds, \ + with mock.patch("helpers_root.helpers.hgoogle_drive_api.get_credentials"), \ mock.patch("helpers_root.helpers.hgoogle_drive_api.read_google_file", return_value=mock_data): df = projdesc.read_google_sheet(url, secret_path) self.assertIsInstance(df, pd.DataFrame) self.assertEqual(df.shape[0], 1) - _LOG.debug("read_google_sheet1 → %s", df) - def test_generate_project_description1(self) -> None: - """ - Test project description generation using mocked OpenAI. - """ + def test_generate_project_description(self) -> None: tech = "Kafka" difficulty = "2" - mock_output = "Title: Kafka Project\nDifficulty: 2\n..." with mock.patch("helpers_root.helpers.hopenai.get_completion", return_value=mock_output): desc = projdesc.generate_project_description(tech, difficulty) self.assertIn("Kafka", desc) self.assertIn("Difficulty", desc) - _LOG.debug("generate_project_description1 → %s", desc) - - def test_create_markdown_file1(self) -> None: - """ - Test the markdown creation process with mocked data and completion. - """ - df = pd.DataFrame({ - "Tool": ["Kafka"], - "Difficulty": ["2"] - }) + + def test_create_markdown_file(self) -> None: + df = pd.DataFrame({"Tool": ["Kafka"], "Difficulty": ["2"]}) markdown_path = "/tmp/test_projects.md" mock_output = "Title: Kafka Project\nDifficulty: 2\n..." 
@@ -60,4 +44,3 @@ def test_create_markdown_file1(self) -> None: mock_to_file.assert_called_once() written_content = mock_to_file.call_args[0][1] self.assertIn("Kafka", written_content) - _LOG.debug("create_markdown_file1 content →\n%s", written_content) From 88d6c50b4c55863348ab4268ddc9d96ba782deae Mon Sep 17 00:00:00 2001 From: Aayush Date: Thu, 12 Jun 2025 23:32:28 -0500 Subject: [PATCH 09/11] TutorTask554:Fixing TOC in documentation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pre-commit checks: All checks passed ✅ --- docs/project_description_how_to_guide.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/project_description_how_to_guide.md b/docs/project_description_how_to_guide.md index 639a9c95ef..19660d2e23 100644 --- a/docs/project_description_how_to_guide.md +++ b/docs/project_description_how_to_guide.md @@ -6,7 +6,7 @@ - [Assumptions / Requirements](#assumptions--requirements) - [Instructions](#instructions) * [Step 1: Fetch Input](#step-1-fetch-input) -- [Step 2: Script Execution](#step-2-script-execution) + * [Step 2: Script Execution](#step-2-script-execution) * [Step 3: Review Output](#step-3-review-output) - [Troubleshooting](#troubleshooting) From afd491cfdd60659fc7156a1e3fc369fdc68ff7b1 Mon Sep 17 00:00:00 2001 From: Aayush Date: Mon, 16 Jun 2025 22:34:45 -0500 Subject: [PATCH 10/11] TutorTask554: Moving files to correct location and some minor file changes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pre-commit checks: All checks passed ✅ --- .../__init__.py | 0 .../generate_class_project_description.py | 217 ++++++++++++++++++ .../project_description.py | 0 ...test_generate_class_project_description.py | 48 ++++ 4 files changed, 265 insertions(+) create mode 100644 tutorial_class_project_instructions/__init__.py create mode 100644 tutorial_class_project_instructions/generate_class_project_description.py rename {DATA605 => 
tutorial_class_project_instructions}/project_description.py (100%) create mode 100644 tutorial_class_project_instructions/test_generate_class_project_description.py diff --git a/tutorial_class_project_instructions/__init__.py b/tutorial_class_project_instructions/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tutorial_class_project_instructions/generate_class_project_description.py b/tutorial_class_project_instructions/generate_class_project_description.py new file mode 100644 index 0000000000..2cea6aed80 --- /dev/null +++ b/tutorial_class_project_instructions/generate_class_project_description.py @@ -0,0 +1,217 @@ +#!/usr/bin/env python +""" +Generate project descriptions from a Google Sheet and save them to a Markdown +file. + +> project_description.py \ + --sheet_url "https://docs.google.com/spreadsheets/d/1abc...gid=0" \ + --markdown_path ./projects/MSML610_Projects.md \ + --max_projects 3 \ + -v INFO + +Import as: + +import DATA605.project_description as dprodesc +""" + +import argparse +import logging +import pathlib +import time +from typing import Any, Optional + +import pandas as pd + +import helpers_root.helpers.hdbg as hdbg +import helpers_root.helpers.hgoogle_drive_api as hgofiapi +import helpers_root.helpers.hio as hio +import helpers_root.helpers.hopenai as hopenai +import helpers_root.helpers.hparser as hparser + +_LOG = logging.getLogger(__name__) + +# Set Constants. +if True: + DEFAULT_SHEET_URL = ( + "https://docs.google.com/" + "spreadsheets/d/" + "1Ez5uRvOgvDMkFc9c6mI21kscTKnpiCSh4UkUh_ifLIw/" + "edit?gid=0#gid=0" + ) + # Set to True to use the actual spreadsheet link +else: + # Set to False for testing purposes + fake_url = "https://docs.google.com/fake-sheet-url" + DEFAULT_SHEET_URL = fake_url +GLOBAL_PROMPT = """Act as a data science professor. +I will give you a tool (XYZ) and difficulty level (1–3). +Write a short bullet-point project brief on how XYZ can be +used for real-time Bitcoin data ingestion in Python. 
+Include: + +- Title +- Difficulty (1 means easy, should take around 7 days to develop, 2 is medium difficulty, should take around 10 days to complete, 3 is hard,should take 14 days to complete) +- Tech Description +- Project Idea +- Python libs +- Is it Free? +- Relevant tool(XYZ) related Resource Links + +Avoid long texts or steps +""" +EXAMPLE = """Example: +Title: Ingest bitcoin prices using AWS Glue (AWS Glue is technology XYZ) +Difficulty: 1 +Description +AWS Glue is a fully managed extract, transform, and load (ETL) service... +Useful resources: AWS Glue Docs +Is it free?: Free tier available with limits +Python libraries: boto3, PySpark +""" +DEFAULT_MARKDOWN_PATH = "./projects/MSML610_Projects.md" +# The maximum number of projects. +# Set the value to None to disable the limit. +DEFAULT_MAX_PROJECTS = None + + +def _read_google_sheet(url: str, secret_path: str) -> pd.DataFrame: + """ + Read the Google Sheet and return the data as a pandas DataFrame. + + :param url: the URL of the Google Sheet to read + :param secret_path: path to google_secret.json + :return: the data + """ + _LOG.info("Reading Google Sheet %s: ", url) + _LOG.info("Using credentials from: %s", secret_path) + credentials = hgofiapi.get_credentials(service_key_path=secret_path) + df = hgofiapi.read_google_file(url, credentials=credentials) + return df + + +def _generate_project_description(project_name: str, difficulty: str) -> Any: + """ + Generate a project description. + + :param project_name: the name of the project + :param difficulty: the difficulty level of the project + :return: the project description + """ + if False: + # Potential (v3) prompt if needed to use. + # Change False to True to use it. + prompt = ( + f"Write a professional and detailed project description" + f"for a data project titled '{project_name}'. " + f"Indicate the difficulty level as '{difficulty}', and include objectives, " + f"technologies used, and expected outcomes." 
+ ) + # Will use more tokens, but might help produce a better result. + elif False: + # v1 (Original) prompt. + # Change False to True to use it. + prompt = ( + f"Generate a project description for '{project_name}'," + f"with difficulty level '{difficulty}'." + ) + else: + # v2: Added by Aayush as an improvement to optimize tokens + # while conveying the same information. + prompt = f"Technology: {project_name}\nDifficulty: {difficulty}" + # Short, to the point and concise. Saves the most tokens while achieving similar results. + project_desc = hopenai.get_completion( + prompt, + system_prompt=GLOBAL_PROMPT, + model="gpt-4o-mini", + cache_mode="FALLBACK", + temperature=0.3, + max_tokens=400, + print_cost=True, + ) + return project_desc + + +def create_markdown_file( + df: pd.DataFrame, + markdown_path: str, + max_projects: Optional[int], + *, + sleep_sec: float = 1.5, +) -> None: + """ + Create a markdown file with the project descriptions using helpers.hio. + + :param df: the dataframe containing the project descriptions + :param markdown_path: the path to the markdown file + :param max_projects: limit to the rows processed + :param sleep_sec: amount of time to sleep between rows + """ + content = "# MSML610 Projects\n\n" + # Generate the project descriptions. + # Limit the number of projects. + rows = df.head(max_projects) if max_projects is not None else df + for _, row in rows.iterrows(): + project_name = row["Tool"] + difficulty = row["Difficulty"] + description = _generate_project_description(project_name, difficulty) + # Add the project description to the markdown file. + content += f"## {project_name}\n" + content += f"{description}\n\n" + # Letting it wait for a while before triggering another request + time.sleep(sleep_sec) + # Write the markdown file. 
+ hio.to_file(markdown_path, content) + + +def _parse() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser( + description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter + ) + parser.add_argument( + "--sheet_url", default=DEFAULT_SHEET_URL, help="Google Sheet URL" + ) + parser.add_argument( + "--secret_path", + default="/app/DATA605/google_secret.json", + help="Path to Google service‑account JSON.", + ) + parser.add_argument( + "--markdown_path", + default=DEFAULT_MARKDOWN_PATH, + help="Output Markdown file", + ) + parser.add_argument( + "--max_projects", + type=int, + default=DEFAULT_MAX_PROJECTS, + help="Limit rows processed (None = all).", + ) + parser.add_argument( + "--openai_key", + type=str, + default=None, + help="OpenAI API key (will override env var)", + ) + hparser.add_verbosity_arg(parser) + return parser + + +def _main(parser: argparse.ArgumentParser) -> None: + args = parser.parse_args() + hdbg.init_logger(verbosity=args.log_level, use_exec_path=True) + # Expand user/relative paths to absolute ones early to avoid surprises. 
+ secret_path = str(pathlib.Path(args.secret_path).expanduser().resolve()) + markdown_path = str(pathlib.Path(args.markdown_path).expanduser().resolve()) + _LOG.info("Reading sheet %s", args.sheet_url) + sheet_df = _read_google_sheet(args.sheet_url, secret_path) + _LOG.info("Generating Markdown → %s", markdown_path) + create_markdown_file( + sheet_df, + markdown_path, + args.max_projects, + ) + _LOG.info("Done: %s", markdown_path) + + +if __name__ == "__main__": + _main(_parse()) diff --git a/DATA605/project_description.py b/tutorial_class_project_instructions/project_description.py similarity index 100% rename from DATA605/project_description.py rename to tutorial_class_project_instructions/project_description.py diff --git a/tutorial_class_project_instructions/test_generate_class_project_description.py b/tutorial_class_project_instructions/test_generate_class_project_description.py new file mode 100644 index 0000000000..449b9f92d6 --- /dev/null +++ b/tutorial_class_project_instructions/test_generate_class_project_description.py @@ -0,0 +1,48 @@ +import pandas as pd +from unittest import mock +import DATA605.project_description as projdesc +import helpers.hunit_test as hunitest +import pytest + +class TestProjectDescriptionWithCache(hunitest.TestCase): + + @pytest.fixture(autouse=True) + def setup_teardown(self): + # Can initialize dummy cache or files here + yield + # Avoid triggering _GLOBAL_CAPSYS in tearDown + + def test_read_google_sheet(self) -> None: + if False: + #Set to False for testing purposes + url = "https://docs.google.com/fake-sheet-url" + secret_path = "/fake/path/to/secret.json" + mock_data = pd.DataFrame({"Tool": ["Kafka"], "Difficulty": ["2"]}) + + with mock.patch("helpers_root.helpers.hgoogle_drive_api.get_credentials"), \ + mock.patch("helpers_root.helpers.hgoogle_drive_api.read_google_file", return_value=mock_data): + df = projdesc.read_google_sheet(url, secret_path) + self.assertIsInstance(df, pd.DataFrame) + self.assertEqual(df.shape[0], 
1) + + def test_generate_project_description(self) -> None: + tech = "Kafka" + difficulty = "2" + mock_output = "Title: Kafka Project\nDifficulty: 2\n..." + + with mock.patch("helpers_root.helpers.hopenai.get_completion", return_value=mock_output): + desc = projdesc.generate_project_description(tech, difficulty) + self.assertIn("Kafka", desc) + self.assertIn("Difficulty", desc) + + def test_create_markdown_file(self) -> None: + df = pd.DataFrame({"Tool": ["Kafka"], "Difficulty": ["2"]}) + markdown_path = "/tmp/test_projects.md" + mock_output = "Title: Kafka Project\nDifficulty: 2\n..." + + with mock.patch("helpers_root.helpers.hopenai.get_completion", return_value=mock_output), \ + mock.patch("helpers_root.helpers.hio.to_file") as mock_to_file: + projdesc.create_markdown_file(df, markdown_path, max_projects=1, sleep_sec=0) + mock_to_file.assert_called_once() + written_content = mock_to_file.call_args[0][1] + self.assertIn("Kafka", written_content) From c9b6ae736f3afee3d6190b1cd9be1a62ea96de6e Mon Sep 17 00:00:00 2001 From: Aayush Date: Mon, 16 Jun 2025 22:45:50 -0500 Subject: [PATCH 11/11] TutorTask554: Removing files from wrong location and adding them to tutorials/class project instructions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pre-commit checks: All checks passed ✅ --- DATA605/test_project_description.py | 46 ---- .../project_description.py | 208 ------------------ ...test_generate_class_project_description.py | 12 +- 3 files changed, 6 insertions(+), 260 deletions(-) delete mode 100644 DATA605/test_project_description.py delete mode 100644 tutorial_class_project_instructions/project_description.py diff --git a/DATA605/test_project_description.py b/DATA605/test_project_description.py deleted file mode 100644 index 1699618d60..0000000000 --- a/DATA605/test_project_description.py +++ /dev/null @@ -1,46 +0,0 @@ -import pandas as pd -from unittest import mock -import DATA605.project_description as projdesc -import 
helpers.hunit_test as hunitest -import pytest - -class TestProjectDescriptionWithCache(hunitest.TestCase): - - @pytest.fixture(autouse=True) - def setup_teardown(self): - # Can initialize dummy cache or files here - yield - # Avoid triggering _GLOBAL_CAPSYS in tearDown - - def test_read_google_sheet(self) -> None: - url = "https://docs.google.com/fake-sheet-url" - secret_path = "/fake/path/to/secret.json" - mock_data = pd.DataFrame({"Tool": ["Kafka"], "Difficulty": ["2"]}) - - with mock.patch("helpers_root.helpers.hgoogle_drive_api.get_credentials"), \ - mock.patch("helpers_root.helpers.hgoogle_drive_api.read_google_file", return_value=mock_data): - df = projdesc.read_google_sheet(url, secret_path) - self.assertIsInstance(df, pd.DataFrame) - self.assertEqual(df.shape[0], 1) - - def test_generate_project_description(self) -> None: - tech = "Kafka" - difficulty = "2" - mock_output = "Title: Kafka Project\nDifficulty: 2\n..." - - with mock.patch("helpers_root.helpers.hopenai.get_completion", return_value=mock_output): - desc = projdesc.generate_project_description(tech, difficulty) - self.assertIn("Kafka", desc) - self.assertIn("Difficulty", desc) - - def test_create_markdown_file(self) -> None: - df = pd.DataFrame({"Tool": ["Kafka"], "Difficulty": ["2"]}) - markdown_path = "/tmp/test_projects.md" - mock_output = "Title: Kafka Project\nDifficulty: 2\n..." 
- - with mock.patch("helpers_root.helpers.hopenai.get_completion", return_value=mock_output), \ - mock.patch("helpers_root.helpers.hio.to_file") as mock_to_file: - projdesc.create_markdown_file(df, markdown_path, max_projects=1, sleep_sec=0) - mock_to_file.assert_called_once() - written_content = mock_to_file.call_args[0][1] - self.assertIn("Kafka", written_content) diff --git a/tutorial_class_project_instructions/project_description.py b/tutorial_class_project_instructions/project_description.py deleted file mode 100644 index ca5fdedbf5..0000000000 --- a/tutorial_class_project_instructions/project_description.py +++ /dev/null @@ -1,208 +0,0 @@ -#!/usr/bin/env python -""" -Generate project descriptions from a Google Sheet and save them to a Markdown -file. - -> project_description.py \ - --sheet_url "https://docs.google.com/spreadsheets/d/1abc...gid=0" \ - --markdown_path ./projects/MSML610_Projects.md \ - --max_projects 3 \ - -v INFO - -Import as: - -import DATA605.project_description as dprodesc -""" - -import argparse -import logging -import pathlib -import time -from typing import Any, Optional - -import pandas as pd - -import helpers_root.helpers.hdbg as hdbg -import helpers_root.helpers.hgoogle_drive_api as hgofiapi -import helpers_root.helpers.hio as hio -import helpers_root.helpers.hopenai as hopenai -import helpers_root.helpers.hparser as hparser - -_LOG = logging.getLogger(__name__) - -# Set Constants. -DEFAULT_SHEET_URL = "https://docs.google.com/spreadsheets/d/1Ez5uRvOgvDMkFc9c6mI21kscTKnpiCSh4UkUh_ifLIw/edit?gid=0#gid=0" -# GLOBAL_PROMPT = """ -# You are a college professor of Data Science. -# I will give you a topic XYZ for a class project. -# Your task is to generate a short, structured project brief for college students focused on implementing a big data system in Python, using the technology XYZ. - -# Requirements: -# The project must involve real-time ingestion and processing of Bitcoin data. -# Emphasize how XYZ helps in this context. 
-# The response should be concise and in bullet points only. -# Avoid long descriptions or step-by-step guides. -# The project must include a time series analysis component. -# The complexity of the project should range from 1, where 1 is easy (it should take around 7 days) to develop, 2 is medium difficulty (it should take around 10 days to complete), 3 is hard (it should take 14 days to complete). - -# The output should follow the template below -# Title: -# Difficulty: (1=easy, 3=difficult) -# Description -# Describe technology -# Describe the project -# Useful resources -# Is it free? -# Python libraries / bindings -# """ -GLOBAL_PROMPT='''Act as a data science professor. I will give you a tool (XYZ) and difficulty level (1–3). Write a short bullet-point project brief on how XYZ can be used for real-time Bitcoin data ingestion in Python. Include: - -- Title -- Difficulty (1 means easy, should take around 7 days to develop, 2 is medium difficulty, should take around 10 days to complete, 3 is hard,should take 14 days to complete) -- Tech Description -- Project Idea -- Python libs -- Is it Free? -- Relevant tool(XYZ) related Resource Links - -Avoid long texts or steps -''' -EXAMPLE = """Example: -Title: Ingest bitcoin prices using AWS Glue (AWS Glue is technology XYZ) -Difficulty: 1 -Description -AWS Glue is a fully managed extract, transform, and load (ETL) service... -Useful resources: AWS Glue Docs -Is it free?: Free tier available with limits -Python libraries: boto3, PySpark -""" -DEFAULT_MARKDOWN_PATH = "./projects/MSML610_Projects.md" -# The maximum number of projects. -# Set the value to None to disable the limit. -DEFAULT_MAX_PROJECTS = None - - -def read_google_sheet(url: str, secret_path: str) -> pd.DataFrame: - """ - Read the Google Sheet and return the data as a pandas DataFrame. 
- - :param url: the URL of the Google Sheet to read - :param secret_path: path to google_secret.json - :return: the data - """ - _LOG.info(f"Reading Google Sheet: {url}") - _LOG.info(f"Using credentials from: {secret_path}") - credentials = hgofiapi.get_credentials(service_key_path=secret_path) - df = hgofiapi.read_google_file(url, credentials=credentials) - return df - - -def generate_project_description(project_name: str, difficulty: str) -> Any: - """ - Generate a project description. - - :param project_name: the name of the project - :param difficulty: the difficulty level of the project - :return: the project description - """ - # Generate the project description. - # prompt = f"Generate a project description for '{project_name}' with difficulty level '{difficulty}'." - # prompt = PROMPT_DOC_URL.strip()+ "\n\n"+ EXAMPLE.strip()+ f"\n\nTechnology: {project_name}\nDifficulty: {difficulty}" - # description = hopenai.get_completion(prompt, model="gpt-4o-mini") - # return description - prompt = f"Technology: {project_name}\nDifficulty: {difficulty}" - project_desc = hopenai.get_completion( - prompt, - system_prompt=GLOBAL_PROMPT, - model="gpt-4o-mini", - cache_mode="FALLBACK", - temperature=0.3, - max_tokens=400, - print_cost=True, - ) - return project_desc - - -def create_markdown_file( - df: pd.DataFrame, - markdown_path: str, - max_projects: Optional[int], - *, - sleep_sec: float = 1.5, -) -> None: - """ - Create a markdown file with the project descriptions using helpers.hio. - - :param df: the dataframe containing the project descriptions - :param markdown_path: the path to the markdown file - :param max_projects: limit to the rows processed - :param sleep_sec: amount of time to sleep between rows - """ - content = "# MSML610 Projects\n\n" - # Generate the project descriptions. - # Limit the number of projects. 
- rows = df.head(max_projects) if max_projects is not None else df - for _, row in rows.iterrows(): - project_name = row["Tool"] - difficulty = row["Difficulty"] - description = generate_project_description(project_name, difficulty) - # Add the project description to the markdown file. - content += f"## {project_name}\n" - content += f"{description}\n\n" - time.sleep(sleep_sec) - # Write the markdown file. - hio.to_file(markdown_path, content) - - -def _parse() -> argparse.ArgumentParser: - parser = argparse.ArgumentParser( - description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter - ) - parser.add_argument( - "--sheet_url", default=DEFAULT_SHEET_URL, help="Google Sheet URL" - ) - parser.add_argument( - "--secret_path", - default="/app/DATA605/google_secret.json", - help="Path to Google service‑account JSON.", - ) - parser.add_argument( - "--markdown_path", - default=DEFAULT_MARKDOWN_PATH, - help="Output Markdown file", - ) - parser.add_argument( - "--max_projects", - type=int, - default=DEFAULT_MAX_PROJECTS, - help="Limit rows processed (None = all).", - ) - parser.add_argument( - "--openai_key", - type=str, - default=None, - help="OpenAI API key (will override env var)", -) - hparser.add_verbosity_arg(parser) # adds -v / --log_level - return parser - - -def _main(parser: argparse.ArgumentParser) -> None: - args = parser.parse_args() - hdbg.init_logger(verbosity=args.log_level, use_exec_path=True) - # Expand user/relative paths to absolute ones early to avoid surprises. 
- secret_path = str(pathlib.Path(args.secret_path).expanduser().resolve()) - markdown_path = str(pathlib.Path(args.markdown_path).expanduser().resolve()) - _LOG.info("Reading sheet %s", args.sheet_url) - sheet_df = read_google_sheet(args.sheet_url, secret_path) - _LOG.info("Generating Markdown → %s", markdown_path) - create_markdown_file( - sheet_df, - markdown_path, - args.max_projects, - ) - _LOG.info("Done: %s", markdown_path) - - -if __name__ == "__main__": - _main(_parse()) diff --git a/tutorial_class_project_instructions/test_generate_class_project_description.py b/tutorial_class_project_instructions/test_generate_class_project_description.py index 449b9f92d6..a9ec99ba3d 100644 --- a/tutorial_class_project_instructions/test_generate_class_project_description.py +++ b/tutorial_class_project_instructions/test_generate_class_project_description.py @@ -1,6 +1,6 @@ import pandas as pd from unittest import mock -import DATA605.project_description as projdesc +import tutorial_class_project_instructions.generate_class_project_description as projdesc import helpers.hunit_test as hunitest import pytest @@ -13,25 +13,25 @@ def setup_teardown(self): # Avoid triggering _GLOBAL_CAPSYS in tearDown def test_read_google_sheet(self) -> None: - if False: - #Set to False for testing purposes + if True: + #Set to True for testing purposes url = "https://docs.google.com/fake-sheet-url" secret_path = "/fake/path/to/secret.json" mock_data = pd.DataFrame({"Tool": ["Kafka"], "Difficulty": ["2"]}) with mock.patch("helpers_root.helpers.hgoogle_drive_api.get_credentials"), \ mock.patch("helpers_root.helpers.hgoogle_drive_api.read_google_file", return_value=mock_data): - df = projdesc.read_google_sheet(url, secret_path) + df = projdesc._read_google_sheet(url, secret_path) self.assertIsInstance(df, pd.DataFrame) self.assertEqual(df.shape[0], 1) - def test_generate_project_description(self) -> None: + def test_generate_class_project_description(self) -> None: tech = "Kafka" difficulty = "2" 
mock_output = "Title: Kafka Project\nDifficulty: 2\n..." with mock.patch("helpers_root.helpers.hopenai.get_completion", return_value=mock_output): - desc = projdesc.generate_project_description(tech, difficulty) + desc = projdesc._generate_project_description(tech, difficulty) self.assertIn("Kafka", desc) self.assertIn("Difficulty", desc)