|
| 1 | +#!/usr/bin/env python |
| 2 | +""" |
| 3 | +Generate project descriptions from a Google Sheet and save them to a Markdown |
| 4 | +file. |
| 5 | +
|
| 6 | +> project_description.py \ |
| 7 | + --sheet_url "https://docs.google.com/spreadsheets/d/1abc...gid=0" \ |
| 8 | + --markdown_path ./projects/MSML610_Projects.md \ |
| 9 | + --max_projects 3 \ |
| 10 | + -v INFO |
| 11 | +
|
| 12 | +Import as: |
| 13 | +
|
| 14 | +import DATA605.project_description as dprodesc |
| 15 | +""" |
| 16 | + |
| 17 | +import argparse |
| 18 | +import logging |
| 19 | +import pathlib |
| 20 | +import time |
| 21 | +from typing import Any, Optional |
| 22 | + |
| 23 | +import pandas as pd |
| 24 | + |
| 25 | +import helpers_root.helpers.hdbg as hdbg |
| 26 | +import helpers_root.helpers.hgoogle_drive_api as hgofiapi |
| 27 | +import helpers_root.helpers.hio as hio |
| 28 | +import helpers_root.helpers.hopenai as hopenai |
| 29 | +import helpers_root.helpers.hparser as hparser |
| 30 | + |
| 31 | +_LOG = logging.getLogger(__name__) |
| 32 | + |
# Module-level configuration.
# Flip `_USE_REAL_SHEET` to False to make the default point at a placeholder
# URL, e.g., when testing without access to the real spreadsheet.
_USE_REAL_SHEET = True
_REAL_SHEET_URL = (
    "https://docs.google.com/spreadsheets/d/"
    "1Ez5uRvOgvDMkFc9c6mI21kscTKnpiCSh4UkUh_ifLIw/"
    "edit?gid=0#gid=0"
)
_FAKE_SHEET_URL = "https://docs.google.com/fake-sheet-url"
DEFAULT_SHEET_URL = _REAL_SHEET_URL if _USE_REAL_SHEET else _FAKE_SHEET_URL
# System prompt describing the brief that the model must produce for a tool.
GLOBAL_PROMPT = """Act as a data science professor.
I will give you a tool (XYZ) and difficulty level (1–3).
Write a short bullet-point project brief on how XYZ can be
used for real-time Bitcoin data ingestion in Python.
Include:

- Title
- Difficulty (1 means easy, should take around 7 days to develop, 2 is medium difficulty, should take around 10 days to complete, 3 is hard,should take 14 days to complete)
- Tech Description
- Project Idea
- Python libs
- Is it Free?
- Relevant tool(XYZ) related Resource Links

Avoid long texts or steps
"""
# Sample of a project brief.
# NOTE(review): not referenced by the code visible in this module.
EXAMPLE = """Example:
Title: Ingest bitcoin prices using AWS Glue (AWS Glue is technology XYZ)
Difficulty: 1
Description
AWS Glue is a fully managed extract, transform, and load (ETL) service...
Useful resources: AWS Glue Docs
Is it free?: Free tier available with limits
Python libraries: boto3, PySpark
"""
# Where the generated Markdown is written when no path is given.
DEFAULT_MARKDOWN_PATH = "./projects/MSML610_Projects.md"
# Upper bound on the number of projects to process; None means no limit.
DEFAULT_MAX_PROJECTS = None
| 75 | + |
| 76 | + |
def _read_google_sheet(url: str, secret_path: str) -> pd.DataFrame:
    """
    Load the content of a Google Sheet into a DataFrame.

    :param url: address of the Google Sheet to load
    :param secret_path: location of the service key file used to build the
        credentials
    :return: the sheet content as returned by `hgofiapi.read_google_file()`
    """
    _LOG.info("Reading Google Sheet %s: ", url)
    _LOG.info("Using credentials from: %s", secret_path)
    # Build the credentials from the service key, then fetch the sheet.
    creds = hgofiapi.get_credentials(service_key_path=secret_path)
    return hgofiapi.read_google_file(url, credentials=creds)
| 90 | + |
| 91 | + |
def _generate_project_description(
    project_name: str,
    difficulty: str,
    *,
    prompt_version: str = "v2",
) -> Any:
    """
    Generate a project description through the OpenAI helper.

    The user prompt is sent together with `GLOBAL_PROMPT` as system prompt.

    :param project_name: the name of the project (i.e., the tool)
    :param difficulty: the difficulty level of the project
    :param prompt_version: wording of the user prompt to send
        - "v1": original one-sentence request
        - "v2": compact `Technology` / `Difficulty` pair (default); it is the
          shortest form and thus uses the fewest tokens
        - "v3": longer, more detailed request; uses more tokens
    :return: the completion returned by `hopenai.get_completion()`
    :raises ValueError: if `prompt_version` is not a supported value
    """
    if prompt_version == "v1":
        prompt = (
            f"Generate a project description for '{project_name}', "
            f"with difficulty level '{difficulty}'."
        )
    elif prompt_version == "v2":
        prompt = f"Technology: {project_name}\nDifficulty: {difficulty}"
    elif prompt_version == "v3":
        prompt = (
            "Write a professional and detailed project description "
            f"for a data project titled '{project_name}'. "
            f"Indicate the difficulty level as '{difficulty}', and include objectives, "
            "technologies used, and expected outcomes."
        )
    else:
        raise ValueError(
            f"Invalid prompt_version='{prompt_version}': "
            "expected one of 'v1', 'v2', 'v3'"
        )
    project_desc = hopenai.get_completion(
        prompt,
        system_prompt=GLOBAL_PROMPT,
        model="gpt-4o-mini",
        cache_mode="FALLBACK",
        temperature=0.3,
        max_tokens=400,
        print_cost=True,
    )
    return project_desc
| 132 | + |
| 133 | + |
def create_markdown_file(
    df: pd.DataFrame,
    markdown_path: str,
    max_projects: Optional[int],
    *,
    sleep_sec: float = 1.5,
) -> None:
    """
    Create a markdown file with the project descriptions using helpers.hio.

    One section is generated per row, using the `Tool` and `Difficulty`
    columns of the dataframe.

    :param df: the dataframe containing one project per row
    :param markdown_path: the path to the markdown file
    :param max_projects: limit to the rows processed; None means all rows
    :param sleep_sec: amount of time to sleep between consecutive requests
    :raises ValueError: if `max_projects` is negative
    """
    if max_projects is not None and max_projects < 0:
        # `df.head()` with a negative count drops rows from the end rather
        # than limiting the number of rows, which is not what a maximum means.
        raise ValueError(
            f"max_projects must be non-negative, got {max_projects}"
        )
    # Limit the number of projects.
    rows = df if max_projects is None else df.head(max_projects)
    # Collect the pieces and join once at the end.
    sections = ["# MSML610 Projects\n\n"]
    for idx, (_, row) in enumerate(rows.iterrows()):
        if idx > 0:
            # Wait before triggering another request; there is nothing to
            # wait for before the first one or after the last one.
            time.sleep(sleep_sec)
        project_name = row["Tool"]
        difficulty = row["Difficulty"]
        description = _generate_project_description(project_name, difficulty)
        # Add the project description to the markdown content.
        sections.append(f"## {project_name}\n{description}\n\n")
    # Write the markdown file.
    hio.to_file(markdown_path, "".join(sections))
| 164 | + |
| 165 | + |
def _parse() -> argparse.ArgumentParser:
    """
    Build the command line parser for this script.

    The module docstring is used as the description and is shown verbatim in
    the help output.

    :return: the parser, including the verbosity option added by `hparser`
    """
    cli = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    # Options are registered in the order listed here.
    option_specs = (
        (
            "--sheet_url",
            {"default": DEFAULT_SHEET_URL, "help": "Google Sheet URL"},
        ),
        (
            "--secret_path",
            {
                "default": "/app/DATA605/google_secret.json",
                "help": "Path to Google service‑account JSON.",
            },
        ),
        (
            "--markdown_path",
            {"default": DEFAULT_MARKDOWN_PATH, "help": "Output Markdown file"},
        ),
        (
            "--max_projects",
            {
                "type": int,
                "default": DEFAULT_MAX_PROJECTS,
                "help": "Limit rows processed (None = all).",
            },
        ),
        (
            "--openai_key",
            {
                "type": str,
                "default": None,
                "help": "OpenAI API key (will override env var)",
            },
        ),
    )
    for flag, spec in option_specs:
        cli.add_argument(flag, **spec)
    hparser.add_verbosity_arg(cli)
    return cli
| 197 | + |
| 198 | + |
def _main(parser: argparse.ArgumentParser) -> None:
    """
    Read the Google Sheet and write the generated Markdown file.

    :param parser: the command line parser built by `_parse()`
    """
    # Local import: `os` is needed only to export the API key below.
    import os

    args = parser.parse_args()
    hdbg.init_logger(verbosity=args.log_level, use_exec_path=True)
    if args.openai_key:
        # `--openai_key` is documented as overriding the environment, so
        # export it before any request is issued.
        # NOTE(review): assumes `hopenai` picks the key up from the
        # `OPENAI_API_KEY` environment variable - confirm.
        os.environ["OPENAI_API_KEY"] = args.openai_key
    # Expand user/relative paths to absolute ones early to avoid surprises.
    secret_path = str(pathlib.Path(args.secret_path).expanduser().resolve())
    markdown_path = str(pathlib.Path(args.markdown_path).expanduser().resolve())
    _LOG.info("Reading sheet %s", args.sheet_url)
    sheet_df = _read_google_sheet(args.sheet_url, secret_path)
    _LOG.info("Generating Markdown → %s", markdown_path)
    create_markdown_file(
        sheet_df,
        markdown_path,
        args.max_projects,
    )
    _LOG.info("Done: %s", markdown_path)
| 214 | + |
| 215 | + |
# Run the command line flow only when executed as a script.
if __name__ == "__main__":
    cli_parser = _parse()
    _main(cli_parser)
0 commit comments