causify-ai · aver81 · Jun 18, 2025 · Jun 11, 2025 · Jun 11, 2025 · Jun 12, 2025
diff --git a/DATA605/Spring2025/projects/TutorTask112a/docker_data605_style/__init__.py b/DATA605/Spring2025/projects/TutorTask112a/docker_data605_style/__init__.py
diff --git a/DATA605/project_description.py b/DATA605/project_description.py
@@ -0,0 +1,208 @@
+#!/usr/bin/env python
+"""
+Generate project descriptions from a Google Sheet and save them to a Markdown
+file.
+
+> project_description.py \
+    --sheet_url "https://docs.google.com/spreadsheets/d/1abc...gid=0" \
+    --markdown_path ./projects/MSML610_Projects.md \
+    --max_projects 3 \
+    -v INFO
+
+Import as:
+
+import DATA605.project_description as dprodesc
+"""
+
+import argparse
+import logging
+import os
+import pathlib
+import time
+from typing import Any, Optional
+
+import pandas as pd
+
+import helpers_root.helpers.hdbg as hdbg
+import helpers_root.helpers.hgoogle_drive_api as hgofiapi
+import helpers_root.helpers.hio as hio
+import helpers_root.helpers.hopenai as hopenai
+import helpers_root.helpers.hparser as hparser
+
+_LOG = logging.getLogger(__name__)
+
+# Set Constants.
+# Google Sheet read when `--sheet_url` is not passed on the command line.
+DEFAULT_SHEET_URL = "https://docs.google.com/spreadsheets/d/1Ez5uRvOgvDMkFc9c6mI21kscTKnpiCSh4UkUh_ifLIw/edit?gid=0#gid=0"
+# Earlier, longer version of the prompt, kept commented out for reference only.
+# GLOBAL_PROMPT = """
+# You are a college professor of Data Science.
+# I will give you a topic XYZ for a class project.
+# Your task is to generate a short, structured project brief for college students focused on implementing a big data system in Python, using the technology XYZ.
+
+# Requirements:
+#     The project must involve real-time ingestion and processing of Bitcoin data.
+#     Emphasize how XYZ helps in this context.
+#     The response should be concise and in bullet points only.
+#     Avoid long descriptions or step-by-step guides.
+#     The project must include a time series analysis component.
+# The complexity of the project should range from 1, where 1 is easy (it should take around 7 days) to develop, 2 is medium difficulty (it should take around 10 days to complete), 3 is hard (it should take 14 days to complete).
+
+# The output should follow the template below
+# Title:
+# Difficulty: (1=easy, 3=difficult)
+# Description
+# Describe technology
+# Describe the project
+# Useful resources
+# Is it free?
+# Python libraries / bindings
+# """
+# Instructions shared by all the projects, passed as `system_prompt` to
+# `hopenai.get_completion()`.
+GLOBAL_PROMPT='''Act as a data science professor. I will give you a tool (XYZ) and difficulty level (1–3). Write a short bullet-point project brief on how XYZ can be used for real-time Bitcoin data ingestion in Python. Include:
+
+- Title
+- Difficulty (1 means easy, should take around 7 days to develop, 2 is medium difficulty, should take around 10 days to complete, 3 is hard,should take 14 days to complete)
+- Tech Description
+- Project Idea
+- Python libs
+- Is it Free?
+- Relevant tool(XYZ) related Resource Links
+
+Avoid long texts or steps
+'''
+# Sample of a generated project brief.
+# NOTE(review): no active code visible in this file references `EXAMPLE`, so it
+# does not reach the model - confirm whether it should be added to the prompt
+# or removed.
+EXAMPLE = """Example:
+Title: Ingest bitcoin prices using AWS Glue (AWS Glue is technology XYZ)
+Difficulty: 1
+Description
+AWS Glue is a fully managed extract, transform, and load (ETL) service...
+Useful resources: AWS Glue Docs
+Is it free?: Free tier available with limits
+Python libraries: boto3, PySpark
+"""
+# Output file written when `--markdown_path` is not passed on the command line.
+DEFAULT_MARKDOWN_PATH = "./projects/MSML610_Projects.md"
+# The maximum number of projects.
+# Set the value to None to disable the limit.
+DEFAULT_MAX_PROJECTS = None
+
+
+def read_google_sheet(url: str, secret_path: str) -> pd.DataFrame:
+    """
+    Read the Google Sheet and return the data as a pandas DataFrame.
+
+    :param url: the URL of the Google Sheet to read
+    :param secret_path: path to google_secret.json
+    :return: the data
+    """
+    _LOG.info(f"Reading Google Sheet: {url}")
+    _LOG.info(f"Using credentials from: {secret_path}")
+    credentials = hgofiapi.get_credentials(service_key_path=secret_path)
+    df = hgofiapi.read_google_file(url, credentials=credentials)
+    return df
+
+
+def generate_project_description(project_name: str, difficulty: str) -> Any:
+    """
+    Generate a project description.
+
+    :param project_name: the name of the project
+    :param difficulty: the difficulty level of the project
+    :return: the project description
+    """
+    # Generate the project description.
+    # prompt = f"Generate a project description for '{project_name}' with difficulty level '{difficulty}'."
+    # prompt = PROMPT_DOC_URL.strip()+ "\n\n"+ EXAMPLE.strip()+ f"\n\nTechnology: {project_name}\nDifficulty: {difficulty}"
+    # description = hopenai.get_completion(prompt, model="gpt-4o-mini")
+    # return description
+    prompt = f"Technology: {project_name}\nDifficulty: {difficulty}"
+    project_desc = hopenai.get_completion(
+        prompt,
+        system_prompt=GLOBAL_PROMPT,
+        model="gpt-4o-mini",
+        cache_mode="FALLBACK",
+        temperature=0.3,
+        max_tokens=400,
+        print_cost=True,
+    )
+    return project_desc
+
+
+def create_markdown_file(
+    df: pd.DataFrame,
+    markdown_path: str,
+    max_projects: Optional[int],
+    *,
+    sleep_sec: float = 1.5,
+) -> None:
+    """
+    Create a markdown file with the project descriptions using helpers.hio.
+
+    :param df: the dataframe containing the project descriptions
+    :param markdown_path: the path to the markdown file
+    :param max_projects: limit to the rows processed
+    :param sleep_sec: amount of time to sleep between rows
+    """
+    content = "# MSML610 Projects\n\n"
+    # Generate the project descriptions.
+    # Limit the number of projects.
+    rows = df.head(max_projects) if max_projects is not None else df
+    for _, row in rows.iterrows():
+        project_name = row["Tool"]
+        difficulty = row["Difficulty"]
+        description = generate_project_description(project_name, difficulty)
+        # Add the project description to the markdown file.
+        content += f"## {project_name}\n"
+        content += f"{description}\n\n"
+        time.sleep(sleep_sec)
+    # Write the markdown file.
+    hio.to_file(markdown_path, content)
+
+
+def _parse() -> argparse.ArgumentParser:
+    parser = argparse.ArgumentParser(
+        description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter
+    )
+    parser.add_argument(
+        "--sheet_url", default=DEFAULT_SHEET_URL, help="Google Sheet URL"
+    )
+    parser.add_argument(
+        "--secret_path",
+        default="/app/DATA605/google_secret.json",
+        help="Path to Google service‑account JSON.",
+    )
+    parser.add_argument(
+        "--markdown_path",
+        default=DEFAULT_MARKDOWN_PATH,
+        help="Output Markdown file",
+    )
+    parser.add_argument(
+        "--max_projects",
+        type=int,
+        default=DEFAULT_MAX_PROJECTS,
+        help="Limit rows processed (None = all).",
+    )
+    parser.add_argument(
+    "--openai_key",
+    type=str,
+    default=None,
+    help="OpenAI API key (will override env var)",
+)
+    hparser.add_verbosity_arg(parser)  # adds -v / --log_level
+    return parser
+
+
+def _main(parser: argparse.ArgumentParser) -> None:
+    args = parser.parse_args()
+    hdbg.init_logger(verbosity=args.log_level, use_exec_path=True)
+    # Expand user/relative paths to absolute ones early to avoid surprises.
+    secret_path = str(pathlib.Path(args.secret_path).expanduser().resolve())
+    markdown_path = str(pathlib.Path(args.markdown_path).expanduser().resolve())
+    _LOG.info("Reading sheet %s", args.sheet_url)
+    sheet_df = read_google_sheet(args.sheet_url, secret_path)
+    _LOG.info("Generating Markdown → %s", markdown_path)
+    create_markdown_file(
+        sheet_df,
+        markdown_path,
+        args.max_projects,
+    )
+    _LOG.info("Done: %s", markdown_path)
+
+
+# Run the generation only when the file is executed as a script, not on import.
+if __name__ == "__main__":
+    _main(_parse())
diff --git a/DATA605/test_project_description.py b/DATA605/test_project_description.py
@@ -0,0 +1,46 @@
+import pandas as pd
+from unittest import mock
+import DATA605.project_description as projdesc
+import helpers.hunit_test as hunitest
+import pytest
+
+class TestProjectDescriptionWithCache(hunitest.TestCase):
+
+    @pytest.fixture(autouse=True)
+    def setup_teardown(self):
+        # Can initialize dummy cache or files here
+        yield
+        # Avoid triggering _GLOBAL_CAPSYS in tearDown
+
+    def test_read_google_sheet(self) -> None:
+        url = "https://docs.google.com/fake-sheet-url"
+        secret_path = "/fake/path/to/secret.json"
+        mock_data = pd.DataFrame({"Tool": ["Kafka"], "Difficulty": ["2"]})
+
+        with mock.patch("helpers_root.helpers.hgoogle_drive_api.get_credentials"), \
+             mock.patch("helpers_root.helpers.hgoogle_drive_api.read_google_file", return_value=mock_data):
+            df = projdesc.read_google_sheet(url, secret_path)
+            self.assertIsInstance(df, pd.DataFrame)
+            self.assertEqual(df.shape[0], 1)
+
+    def test_generate_project_description(self) -> None:
+        tech = "Kafka"
+        difficulty = "2"
+        mock_output = "Title: Kafka Project\nDifficulty: 2\n..."
+
+        with mock.patch("helpers_root.helpers.hopenai.get_completion", return_value=mock_output):
+            desc = projdesc.generate_project_description(tech, difficulty)
+            self.assertIn("Kafka", desc)
+            self.assertIn("Difficulty", desc)
+
+    def test_create_markdown_file(self) -> None:
+        df = pd.DataFrame({"Tool": ["Kafka"], "Difficulty": ["2"]})
+        markdown_path = "/tmp/test_projects.md"
+        mock_output = "Title: Kafka Project\nDifficulty: 2\n..."
+
+        with mock.patch("helpers_root.helpers.hopenai.get_completion", return_value=mock_output), \
+             mock.patch("helpers_root.helpers.hio.to_file") as mock_to_file:
+            projdesc.create_markdown_file(df, markdown_path, max_projects=1, sleep_sec=0)
+            mock_to_file.assert_called_once()
+            written_content = mock_to_file.call_args[0][1]
+            self.assertIn("Kafka", written_content)
diff --git a/docs/project_description_explanation.md b/docs/project_description_explanation.md
@@ -0,0 +1,76 @@
+# Explanation: `project_description.py`
+
+<!-- toc -->
+
+- [Introduction and motivation](#introduction-and-motivation)
+- [Core Concepts](#core-concepts)
+- [How It Works](#how-it-works)
+- [Design Rationale](#design-rationale)
+- [Trade-offs and Alternatives](#trade-offs-and-alternatives)
+
+<!-- tocstop -->
+
+## Introduction and motivation
+
+- This tool automates the generation of academic project descriptions by
+  integrating Google Sheets input with the OpenAI API.
+- It addresses the need for scalable, consistent, and high-quality project
+  documentation based on dynamic student or faculty input.
+- It is intended to streamline and automate project generation and
+  documentation.
+
+## Core Concepts
+
+- **Google Sheets Integration:** Uses Google Sheets as the dynamic data source
+  for project names and difficulty levels.
+- **Prompt Engineering:** A pre-defined prompt template guides GPT to produce
+  structured project descriptions.
+- **Markdown Generation:** Outputs the generated content into a formatted
+  Markdown file for easy distribution.
+- **Helper Modules:** External utility modules (`hgoogle_drive_api`, `hopenai`,
+  `hio`) abstract authentication, I/O, and API interaction.
+
+## How It Works
+
+- The script follows this control flow:
+
+  ```markdown
+  [Google Sheet URL] → read_google_sheet() → [DataFrame of projects] → loop →
+  Create prompt and feed into GPT → [GPT-generated text] → create_markdown_file()
+  → [Markdown output]
+  ```
+
+- Key Functions:
+  - `read_google_sheet(url, secret_path)`: Reads the spreadsheet, using the
+    credentials stored at `secret_path`, and returns a pandas DataFrame.
+  - `generate_project_description(project_name, difficulty)`: Sends input to
+    GPT-4o-mini model and returns generated text.
+  - `create_markdown_file(df, markdown_path, max_projects)`: Iterates over the
+    DataFrame (at most `max_projects` rows), generates a description for each
+    row, and writes the result to a Markdown file.
+
+## Design Rationale
+
+- **Automation Focus:** Built to minimize manual work for faculty managing large
+  project datasets.
+- **Modular Helpers:** Offloading I/O and API logic to separate modules makes
+  this script easier to maintain or port.
+- **GPT as Content Generator:** Using GPT-4o-mini allows flexibility and
+  high-quality text output with minimal prompt tuning.
+
+## Trade-offs and Alternatives
+
+- **Current Approach:**
+  - Advantages:
+    - Automated, reproducible, and scalable.
+    - Maintains separation of logic (reading input, generating content, writing
+      file).
+  - Drawbacks:
+    - Dependent on OpenAI and Google APIs (connectivity and API keys required).
+    - Limited error handling and logging for individual failures.
+
+- **Alternative Approach:**
+  - Using a GUI-based application or Jupyter notebook for manual review and
+    editing.
+    - Advantages:
+      - Allows user customization and validation at each step.
+    - Drawbacks:
+      - Slower and less scalable; not suitable for batch generation.