diff --git a/DATA605/Spring2025/projects/TutorTask112a/docker_data605_style/__init__.py b/DATA605/Spring2025/projects/TutorTask112a/docker_data605_style/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/docs/project_description_explanation.md b/docs/project_description_explanation.md new file mode 100644 index 0000000000..e4fff09c6c --- /dev/null +++ b/docs/project_description_explanation.md @@ -0,0 +1,76 @@ +# Explanation: `project_description.py` + + + +- [Introduction and motivation](#introduction-and-motivation) +- [Core Concepts](#core-concepts) +- [How It Works](#how-it-works) +- [Design Rationale](#design-rationale) +- [Trade-offs and Alternatives](#trade-offs-and-alternatives) + + + +## Introduction and motivation + +- This tool automates the generation of academic project descriptions by + integrating Google Sheets input with the OpenAI API. +- It addresses the need for scalable, consistent, and high-quality project + documentation based on dynamic student or faculty input. +- It is intended to streamline and automate project generation and + documentation. + +## Core Concepts + +- **Google Sheets Integration:** Uses Google Sheets as the dynamic data source + for project names and difficulty levels. +- **Prompt Engineering:** A pre-defined prompt template guides GPT to produce + structured project descriptions. +- **Markdown Generation:** Outputs the generated content into a formatted + Markdown file for easy distribution. +- **Helper Modules:** External utility modules (`hgoogle_drive_api`, `hopenai`, + `hio`) abstract authentication, I/O, and API interaction. + +## How It Works + +- The script follows this control flow: + + ```markdown + [Google Sheet URL] → _read_google_sheet() → [DataFrame of projects] → loop → + Create prompt and feed into GPT → [GPT-generated text] → create_markdown_file() + → [Markdown output] + ``` + +- Key Functions: + - `_read_google_sheet(url, secret_path)`: Reads the spreadsheet and returns a pandas DataFrame. 
+ - `_generate_project_description(project_name, difficulty)`: Sends the input to + the GPT-4o-mini model and returns the generated text. + - `create_markdown_file(df, markdown_path, max_projects)`: Iterates over the DataFrame, + generates a description for each row, and writes them to a Markdown file. + +## Design Rationale + +- **Automation Focus:** Built to minimize manual work for faculty managing large + project datasets. +- **Modular Helpers:** Offloading I/O and API logic to separate modules makes + this script easier to maintain or port. +- **GPT as Content Generator:** Using GPT-4o-mini allows flexibility and + high-quality text output with minimal prompt tuning. + +## Trade-offs and Alternatives + +- **Current Approach:** + - Advantages: + - Automated, reproducible, and scalable. + - Maintains separation of logic (reading input, generating content, writing + file). + - Drawbacks: + - Dependent on OpenAI and Google APIs (connectivity and API keys required). + - Limited error handling and logging for individual failures. + +- **Alternative Approach:** + - Using a GUI-based application or Jupyter notebook for manual review and + editing. + - Advantages: + - Allows user customization and validation at each step. + - Drawbacks: + - Slower and less scalable; not suitable for batch generation. 
diff --git a/docs/project_description_how_to_guide.md b/docs/project_description_how_to_guide.md new file mode 100644 index 0000000000..19660d2e23 --- /dev/null +++ b/docs/project_description_how_to_guide.md @@ -0,0 +1,80 @@ +# How To Guide: `project_description.py` + + + +- [What It Does](#what-it-does) +- [Assumptions / Requirements](#assumptions--requirements) +- [Instructions](#instructions) + * [Step 1: Fetch Input](#step-1-fetch-input) + * [Step 2: Script Execution](#step-2-script-execution) + * [Step 3: Review Output](#step-3-review-output) +- [Troubleshooting](#troubleshooting) + + + +## What It Does + +- Automates the process of generating academic project descriptions by: + - Reading project data from a Google Sheet. + - Using OpenAI's API to auto-generate detailed project descriptions. + - Saving the final output in a formatted Markdown file for distribution. + +## Assumptions / Requirements + +- A Google Cloud service key file, ready to use +- Docker running +- A valid OpenAI API key for model access +- Project-specific helper modules must be available: + - `helpers.hgoogle_drive_api` + - `helpers.hio` + - `helpers.hopenai` + +## Instructions + +### Step 1: Fetch Input + +Ensure the Google Sheet is publicly accessible or shared with the configured +service account. 
+ +For instructions on how to configure the Google Sheets API, follow this link: +[https://github.com/causify-ai/helpers/blob/c50fddfdffccdccb1b2d963b729ab9674d8fda8f/docs/tools/notebooks/all.gsheet_into_pandas.how_to_guide.md](https://github.com/causify-ai/helpers/blob/c50fddfdffccdccb1b2d963b729ab9674d8fda8f/docs/tools/notebooks/all.gsheet_into_pandas.how_to_guide.md) + +The Google Sheet should contain the following columns: + +- `Tool` (the project name) + +- `Difficulty` + +### Step 2: Script Execution + +- Run the script directly using Python. +- This will: + + Authenticate and read the Google Sheet + + Generate a project description using OpenAI for each row + + Save the first N projects (or all of them if `--max_projects` is not set) in the file given by + `--markdown_path` (default: `./projects/MSML610_Projects.md`) + +Code to run the script: + +```bash +python <script_dir>/project_description.py --sheet_url <sheet_url> --secret_path <secret_path> --openai_key <key> --markdown_path <markdown_path> -v INFO +``` + +Edit the Google Sheet URL inside the script or pass a new one through the CLI. + +### Step 3: Review Output + +- The Markdown file is stored at the path given by `--markdown_path` (default: `./projects/MSML610_Projects.md`). + +## Troubleshooting + +Issue: `google.auth.exceptions.DefaultCredentialsError`. Cause: The Google service key was +not found at the expected path. Fix: Place the correct `google_secret.json` file +in `/app/DATA605/`. + +Issue: Empty or incomplete output file. Cause: API failure or invalid sheet +format. Fix: Check the logs, verify that the OpenAI and Google API calls are working, +and ensure the data in the Google Sheet is structured correctly. 
diff --git a/tutorial_class_project_instructions/__init__.py b/tutorial_class_project_instructions/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tutorial_class_project_instructions/generate_class_project_description.py b/tutorial_class_project_instructions/generate_class_project_description.py new file mode 100644 index 0000000000..2cea6aed80 --- /dev/null +++ b/tutorial_class_project_instructions/generate_class_project_description.py @@ -0,0 +1,217 @@ +#!/usr/bin/env python +""" +Generate project descriptions from a Google Sheet and save them to a Markdown +file. + +> project_description.py \ + --sheet_url "https://docs.google.com/spreadsheets/d/1abc...gid=0" \ + --markdown_path ./projects/MSML610_Projects.md \ + --max_projects 3 \ + -v INFO + +Import as: + +import DATA605.project_description as dprodesc +""" + +import argparse +import logging +import pathlib +import time +from typing import Any, Optional + +import pandas as pd + +import helpers_root.helpers.hdbg as hdbg +import helpers_root.helpers.hgoogle_drive_api as hgofiapi +import helpers_root.helpers.hio as hio +import helpers_root.helpers.hopenai as hopenai +import helpers_root.helpers.hparser as hparser + +_LOG = logging.getLogger(__name__) + +# Set Constants. +if True: + DEFAULT_SHEET_URL = ( + "https://docs.google.com/" + "spreadsheets/d/" + "1Ez5uRvOgvDMkFc9c6mI21kscTKnpiCSh4UkUh_ifLIw/" + "edit?gid=0#gid=0" + ) + # Set to True to use the actual spreadsheet link +else: + # Set to False for testing purposes + fake_url = "https://docs.google.com/fake-sheet-url" + DEFAULT_SHEET_URL = fake_url +GLOBAL_PROMPT = """Act as a data science professor. +I will give you a tool (XYZ) and difficulty level (1–3). +Write a short bullet-point project brief on how XYZ can be +used for real-time Bitcoin data ingestion in Python. 
+Include: + +- Title +- Difficulty (1 means easy, should take around 7 days to develop, 2 is medium difficulty, should take around 10 days to complete, 3 is hard,should take 14 days to complete) +- Tech Description +- Project Idea +- Python libs +- Is it Free? +- Relevant tool(XYZ) related Resource Links + +Avoid long texts or steps +""" +EXAMPLE = """Example: +Title: Ingest bitcoin prices using AWS Glue (AWS Glue is technology XYZ) +Difficulty: 1 +Description +AWS Glue is a fully managed extract, transform, and load (ETL) service... +Useful resources: AWS Glue Docs +Is it free?: Free tier available with limits +Python libraries: boto3, PySpark +""" +DEFAULT_MARKDOWN_PATH = "./projects/MSML610_Projects.md" +# The maximum number of projects. +# Set the value to None to disable the limit. +DEFAULT_MAX_PROJECTS = None + + +def _read_google_sheet(url: str, secret_path: str) -> pd.DataFrame: + """ + Read the Google Sheet and return the data as a pandas DataFrame. + + :param url: the URL of the Google Sheet to read + :param secret_path: path to google_secret.json + :return: the data + """ + _LOG.info("Reading Google Sheet %s: ", url) + _LOG.info("Using credentials from: %s", secret_path) + credentials = hgofiapi.get_credentials(service_key_path=secret_path) + df = hgofiapi.read_google_file(url, credentials=credentials) + return df + + +def _generate_project_description(project_name: str, difficulty: str) -> Any: + """ + Generate a project description. + + :param project_name: the name of the project + :param difficulty: the difficulty level of the project + :return: the project description + """ + if False: + # Potential (v3) prompt if needed to use. + # Change False to True to use it. + prompt = ( + f"Write a professional and detailed project description" + f"for a data project titled '{project_name}'. " + f"Indicate the difficulty level as '{difficulty}', and include objectives, " + f"technologies used, and expected outcomes." 
+ ) + # Will use more tokens, but might help produce a better result. + elif False: + # v1 (Original) prompt. + # Change False to True to use it. + prompt = ( + f"Generate a project description for '{project_name}'," + f"with difficulty level '{difficulty}'." + ) + else: + # v2: Added by Aayush as an improvement to optimize tokens + # while conveying the same information. + prompt = f"Technology: {project_name}\nDifficulty: {difficulty}" + # Short, to the point and concise. Saves the most tokens while achieving similar results. + project_desc = hopenai.get_completion( + prompt, + system_prompt=GLOBAL_PROMPT, + model="gpt-4o-mini", + cache_mode="FALLBACK", + temperature=0.3, + max_tokens=400, + print_cost=True, + ) + return project_desc + + +def create_markdown_file( + df: pd.DataFrame, + markdown_path: str, + max_projects: Optional[int], + *, + sleep_sec: float = 1.5, +) -> None: + """ + Create a markdown file with the project descriptions using helpers.hio. + + :param df: the dataframe containing the project descriptions + :param markdown_path: the path to the markdown file + :param max_projects: limit to the rows processed + :param sleep_sec: amount of time to sleep between rows + """ + content = "# MSML610 Projects\n\n" + # Generate the project descriptions. + # Limit the number of projects. + rows = df.head(max_projects) if max_projects is not None else df + for _, row in rows.iterrows(): + project_name = row["Tool"] + difficulty = row["Difficulty"] + description = _generate_project_description(project_name, difficulty) + # Add the project description to the markdown file. + content += f"## {project_name}\n" + content += f"{description}\n\n" + # Letting it wait for a while before triggering another request + time.sleep(sleep_sec) + # Write the markdown file. 
+ hio.to_file(markdown_path, content) + + +def _parse() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser( + description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter + ) + parser.add_argument( + "--sheet_url", default=DEFAULT_SHEET_URL, help="Google Sheet URL" + ) + parser.add_argument( + "--secret_path", + default="/app/DATA605/google_secret.json", + help="Path to Google service‑account JSON.", + ) + parser.add_argument( + "--markdown_path", + default=DEFAULT_MARKDOWN_PATH, + help="Output Markdown file", + ) + parser.add_argument( + "--max_projects", + type=int, + default=DEFAULT_MAX_PROJECTS, + help="Limit rows processed (None = all).", + ) + parser.add_argument( + "--openai_key", + type=str, + default=None, + help="OpenAI API key (will override env var)", + ) + hparser.add_verbosity_arg(parser) + return parser + + +def _main(parser: argparse.ArgumentParser) -> None: + args = parser.parse_args() + hdbg.init_logger(verbosity=args.log_level, use_exec_path=True) + # Expand user/relative paths to absolute ones early to avoid surprises. 
+ secret_path = str(pathlib.Path(args.secret_path).expanduser().resolve()) + markdown_path = str(pathlib.Path(args.markdown_path).expanduser().resolve()) + _LOG.info("Reading sheet %s", args.sheet_url) + sheet_df = _read_google_sheet(args.sheet_url, secret_path) + _LOG.info("Generating Markdown → %s", markdown_path) + create_markdown_file( + sheet_df, + markdown_path, + args.max_projects, + ) + _LOG.info("Done: %s", markdown_path) + + +if __name__ == "__main__": + _main(_parse()) diff --git a/tutorial_class_project_instructions/test_generate_class_project_description.py b/tutorial_class_project_instructions/test_generate_class_project_description.py new file mode 100644 index 0000000000..a9ec99ba3d --- /dev/null +++ b/tutorial_class_project_instructions/test_generate_class_project_description.py @@ -0,0 +1,48 @@ +import pandas as pd +from unittest import mock +import tutorial_class_project_instructions.generate_class_project_description as projdesc +import helpers.hunit_test as hunitest +import pytest + +class TestProjectDescriptionWithCache(hunitest.TestCase): + + @pytest.fixture(autouse=True) + def setup_teardown(self): + # Can initialize dummy cache or files here + yield + # Avoid triggering _GLOBAL_CAPSYS in tearDown + + def test_read_google_sheet(self) -> None: + if True: + #Set to True for testing purposes + url = "https://docs.google.com/fake-sheet-url" + secret_path = "/fake/path/to/secret.json" + mock_data = pd.DataFrame({"Tool": ["Kafka"], "Difficulty": ["2"]}) + + with mock.patch("helpers_root.helpers.hgoogle_drive_api.get_credentials"), \ + mock.patch("helpers_root.helpers.hgoogle_drive_api.read_google_file", return_value=mock_data): + df = projdesc._read_google_sheet(url, secret_path) + self.assertIsInstance(df, pd.DataFrame) + self.assertEqual(df.shape[0], 1) + + def test_generate_class_project_description(self) -> None: + tech = "Kafka" + difficulty = "2" + mock_output = "Title: Kafka Project\nDifficulty: 2\n..." 
+ + with mock.patch("helpers_root.helpers.hopenai.get_completion", return_value=mock_output): + desc = projdesc._generate_project_description(tech, difficulty) + self.assertIn("Kafka", desc) + self.assertIn("Difficulty", desc) + + def test_create_markdown_file(self) -> None: + df = pd.DataFrame({"Tool": ["Kafka"], "Difficulty": ["2"]}) + markdown_path = "/tmp/test_projects.md" + mock_output = "Title: Kafka Project\nDifficulty: 2\n..." + + with mock.patch("helpers_root.helpers.hopenai.get_completion", return_value=mock_output), \ + mock.patch("helpers_root.helpers.hio.to_file") as mock_to_file: + projdesc.create_markdown_file(df, markdown_path, max_projects=1, sleep_sec=0) + mock_to_file.assert_called_once() + written_content = mock_to_file.call_args[0][1] + self.assertIn("Kafka", written_content)