Skip to content

Commit d9722c8

Browse files
authored
Add Content Fetcher for Multiple Protocols and Improve Logging (#23)
#### Overview:

This pull request introduces a `ContentFetcher` module, expanding the library's capabilities to fetch content from various protocols like local files, HTTP(S), GitHub repositories, S3, and Google Cloud Storage. It also enhances the logging configuration for better debugging and monitoring.

#### Changes:

- **`content_fetcher.py`**:
  - New module to handle content fetching across multiple protocols, with support for caching and error handling.
- **`README.md` and `README.es.md`**:
  - Updated to document examples of using the `ContentFetcher` for different content locations.
- **`requirements.txt`**:
  - Added dependencies: `boto3`, `google-cloud`, and `google-api-core`.
- **`file_item.py`**:
  - Integrated `ContentFetcher` for streamlined content retrieval.
  - Removed redundant fetch logic.
- **`logging_config.py`**:
  - Enhanced logging format to include file names and line numbers in debug mode.

#### Justification:

The addition of `ContentFetcher` simplifies content retrieval and extends the library's utility by supporting multiple widely-used protocols. Improved logging aids developers in debugging and monitoring the application's execution.

#### Impact:

- Users can now fetch content seamlessly from various sources without writing custom logic for each protocol.
- Enhanced logging improves development and production diagnostics.
- Minor increase in dependencies for cloud storage support.
1 parent 9c18d9b commit d9722c8

6 files changed

Lines changed: 243 additions & 17 deletions

File tree

README.es.md

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -149,6 +149,18 @@ structure:
149149
echo "Hello, {{@ author_name @}}!"
150150
- LICENSE:
151151
file: https://raw.githubusercontent.com/nishanths/license/master/LICENSE
152+
- archivo_remoto.txt:
153+
file: file:///ruta/al/archivo/local.txt
154+
- archivo_github.py:
155+
file: github://owner/repo/branch/path/to/file.py
156+
- archivo_github_https.py:
157+
file: githubhttps://owner/repo/branch/path/to/file.py
158+
- archivo_github_ssh.py:
159+
file: githubssh://owner/repo/branch/path/to/file.py
160+
- archivo_s3.txt:
161+
file: s3://bucket_name/key
162+
- archivo_gcs.txt:
163+
file: gs://bucket_name/key
152164
- src/main.py:
153165
content: |
154166
print("Hello, World!")

README.md

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -147,6 +147,18 @@ structure:
147147
echo "Hello, {{@ author_name @}}!"
148148
- LICENSE:
149149
file: https://raw.githubusercontent.com/nishanths/license/master/LICENSE
150+
- remote_file.txt:
151+
file: file:///path/to/local/file.txt
152+
- github_file.py:
153+
file: github://owner/repo/branch/path/to/file.py
154+
- github_https_file.py:
155+
file: githubhttps://owner/repo/branch/path/to/file.py
156+
- github_ssh_file.py:
157+
file: githubssh://owner/repo/branch/path/to/file.py
158+
- s3_file.txt:
159+
file: s3://bucket_name/key
160+
- gcs_file.txt:
161+
file: gs://bucket_name/key
150162
- src/main.py:
151163
content: |
152164
print("Hello, World!")

requirements.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,3 +6,6 @@ jinja2
66
PyGithub
77
argcomplete
88
colorlog
9+
boto3
10+
google-cloud-storage
11+
google-api-core

struct_module/content_fetcher.py

Lines changed: 202 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,202 @@
1+
# FILE: content_fetcher.py
2+
import os
3+
import re
4+
import requests
5+
import subprocess
6+
from pathlib import Path
7+
import hashlib
8+
import logging
9+
10+
try:
    import boto3
    from botocore.exceptions import NoCredentialsError, ClientError
    boto3_available = True
except ImportError:
    # boto3 is optional: without it the s3:// scheme is simply not registered.
    boto3_available = False

try:
    from google.cloud import storage
    from google.api_core.exceptions import GoogleAPIError
    gcs_available = True
except ImportError:
    # google-cloud-storage is optional: without it gs:// is not registered.
    gcs_available = False


class ContentFetcher:
    """Fetch text content from local files, HTTPS URLs, GitHub, S3 and GCS.

    Downloaded artifacts (HTTPS responses, cloned repositories, S3/GCS
    objects) are stored under ``cache_dir``.
    """

    # Seconds to wait on the HTTPS server before giving up; without a timeout
    # ``requests.get`` can block indefinitely.
    HTTP_TIMEOUT = 30

    def __init__(self, cache_dir=None):
        """Create the fetcher and make sure the cache directory exists.

        Args:
            cache_dir: Directory used for cached downloads. Defaults to
                ``~/.struct/cache`` when falsy.
        """
        self.logger = logging.getLogger(__name__)
        self.cache_dir = Path(cache_dir or os.path.expanduser("~/.struct/cache"))
        self.cache_dir.mkdir(parents=True, exist_ok=True)

    def fetch_content(self, content_location):
        """Fetch content from a given location.

        Supported protocols:
        - Local file (file://)
        - HTTPS (https://)
        - GitHub repository (github://owner/repo/branch/file_path)
        - GitHub HTTPS (githubhttps://owner/repo/branch/file_path)
        - GitHub SSH (githubssh://owner/repo/branch/file_path)
        - S3 bucket (s3://bucket_name/key), only when boto3 is importable
        - Google Cloud Storage (gs://bucket_name/key), only when
          google-cloud-storage is importable

        Args:
            content_location: Location string starting with one of the
                schemes above.

        Returns:
            The fetched content as a string.

        Raises:
            ValueError: If no registered scheme matches ``content_location``
                or the location is malformed for its scheme.
        """
        protocol_map = {
            "file://": self._fetch_local_file,
            "https://": self._fetch_http_url,
            "github://": self._fetch_github_file,
            "githubhttps://": self._fetch_github_https_file,
            "githubssh://": self._fetch_github_ssh_file,
        }

        if boto3_available:
            protocol_map["s3://"] = self._fetch_s3_file
        if gcs_available:
            protocol_map["gs://"] = self._fetch_gcs_file

        for prefix, method in protocol_map.items():
            if content_location.startswith(prefix):
                if prefix == "file://":
                    # The local handler wants a bare filesystem path.
                    return method(content_location[len(prefix):])
                # Every other handler validates/uses the full location
                # including its scheme (regex match or requests.get), so the
                # prefix must NOT be stripped here.
                return method(content_location)

        raise ValueError(f"Unsupported content location: {content_location}")

    def _fetch_local_file(self, file_path):
        """Return the text of the local file at ``file_path`` (no scheme)."""
        self.logger.debug(f"Fetching content from local file: {file_path}")
        file_path = Path(file_path)
        with file_path.open('r') as file:
            return file.read()

    def _fetch_http_url(self, url):
        """Return the body of ``url``, served from the cache when present.

        Args:
            url: Full URL including the ``https://`` scheme.

        Raises:
            requests.HTTPError: If the server answers with an error status.
        """
        self.logger.debug(f"Fetching content from URL: {url}")
        # Create a hash of the URL to use as a cache key
        cache_key = hashlib.md5(url.encode()).hexdigest()
        cache_file_path = self.cache_dir / cache_key

        if cache_file_path.exists():
            self.logger.debug(f"Loading content from cache: {cache_file_path}")
            return cache_file_path.read_text(encoding="utf-8")

        response = requests.get(url, timeout=self.HTTP_TIMEOUT)
        response.raise_for_status()
        # Fixed encoding so a response containing characters outside the
        # platform's default codec can still be cached and read back.
        cache_file_path.write_text(response.text, encoding="utf-8")

        return response.text

    @staticmethod
    def _split_github_location(github_url, scheme):
        """Split ``<scheme>://owner/repo/branch/file_path`` into its parts.

        Returns:
            Tuple ``(owner, repo, branch, file_path)``.

        Raises:
            ValueError: If ``github_url`` does not have the expected shape.
        """
        pattern = rf"{re.escape(scheme)}://([^/]+)/([^/]+)/([^/]+)/(.+)"
        match = re.match(pattern, github_url)
        if not match:
            raise ValueError(
                f"Invalid GitHub URL format. Expected {scheme}://owner/repo/branch/file_path"
            )
        return match.groups()

    def _fetch_github_file(self, github_url):
        """
        Fetch a file from a GitHub repository using HTTPS.
        Expected format: github://owner/repo/branch/file_path
        """
        self.logger.debug(f"Fetching content from GitHub: {github_url}")
        owner, repo, branch, file_path = self._split_github_location(github_url, "github")
        return self._clone_or_fetch_github(owner, repo, branch, file_path, https=True)

    def _fetch_github_https_file(self, github_url):
        """
        Fetch a file from a GitHub repository using HTTPS.
        Expected format: githubhttps://owner/repo/branch/file_path
        """
        self.logger.debug(f"Fetching content from GitHub (HTTPS): {github_url}")
        owner, repo, branch, file_path = self._split_github_location(github_url, "githubhttps")
        return self._clone_or_fetch_github(owner, repo, branch, file_path, https=True)

    def _fetch_github_ssh_file(self, github_url):
        """
        Fetch a file from a GitHub repository using SSH.
        Expected format: githubssh://owner/repo/branch/file_path
        """
        self.logger.debug(f"Fetching content from GitHub (SSH): {github_url}")
        owner, repo, branch, file_path = self._split_github_location(github_url, "githubssh")
        return self._clone_or_fetch_github(owner, repo, branch, file_path, https=False)

    def _clone_or_fetch_github(self, owner, repo, branch, file_path, https=True):
        """Clone (or ``git pull``) the repository in the cache and read a file.

        Args:
            owner: GitHub account owning the repository.
            repo: Repository name.
            branch: Branch to clone.
            file_path: Path of the wanted file relative to the repository root.
            https: Clone over HTTPS when true, over SSH otherwise.

        Returns:
            The text content of ``file_path``.

        Raises:
            subprocess.CalledProcessError: If the git command fails.
            FileNotFoundError: If ``file_path`` is absent from the checkout.
        """
        repo_cache_path = self.cache_dir / f"{owner}_{repo}_{branch}"
        clone_url = f"https://github.com/{owner}/{repo}.git" if https else f"git@github.com:{owner}/{repo}.git"

        # Clone or fetch the repository
        if not repo_cache_path.exists():
            self.logger.debug(f"Cloning repository: {owner}/{repo} (branch: {branch})")
            subprocess.run(["git", "clone", "-b", branch, clone_url, str(repo_cache_path)], check=True)
        else:
            self.logger.debug(f"Repository already cloned. Pulling latest changes for: {repo_cache_path}")
            subprocess.run(["git", "-C", str(repo_cache_path), "pull"], check=True)

        # Read the requested file
        file_full_path = repo_cache_path / file_path
        if not file_full_path.exists():
            raise FileNotFoundError(f"File {file_path} not found in repository {owner}/{repo} on branch {branch}")

        with file_full_path.open('r') as file:
            return file.read()

    def _fetch_s3_file(self, s3_url):
        """
        Fetch a file from an S3 bucket.
        Expected format: s3://bucket_name/key

        Raises:
            ImportError: If boto3 is not installed.
            ValueError: If ``s3_url`` is malformed.
            FileNotFoundError: If the download fails with error code "404".
            RuntimeError: On missing credentials or any other client error.
        """
        if not boto3_available:
            raise ImportError("boto3 is not installed. Please install it to use S3 fetching.")

        self.logger.debug(f"Fetching content from S3: {s3_url}")
        match = re.match(r"s3://([^/]+)/(.+)", s3_url)
        if not match:
            raise ValueError("Invalid S3 URL format. Expected s3://bucket_name/key")

        bucket_name, key = match.groups()
        local_file_path = self.cache_dir / Path(key).name

        try:
            session = boto3.Session()  # Create a new session
            s3_client = session.client("s3")
            s3_client.download_file(bucket_name, key, str(local_file_path))
            self.logger.debug(f"Downloaded S3 file to: {local_file_path}")
        except NoCredentialsError as e:
            raise RuntimeError(
                "AWS credentials not found. Ensure that your credentials are configured properly."
            ) from e
        except ClientError as e:
            error_code = e.response.get("Error", {}).get("Code")
            if error_code == "404":
                raise FileNotFoundError(f"The specified S3 key does not exist: {key}") from e
            raise RuntimeError(f"Failed to download S3 file: {e}") from e

        with local_file_path.open('r') as file:
            return file.read()

    def _fetch_gcs_file(self, gcs_url):
        """
        Fetch a file from Google Cloud Storage.
        Expected format: gs://bucket_name/key

        Raises:
            ImportError: If google-cloud-storage is not installed.
            ValueError: If ``gcs_url`` is malformed.
            RuntimeError: If the download raises ``GoogleAPIError``.
        """
        if not gcs_available:
            raise ImportError("google-cloud-storage is not installed. Please install it to use GCS fetching.")

        self.logger.debug(f"Fetching content from GCS: {gcs_url}")
        match = re.match(r"gs://([^/]+)/(.+)", gcs_url)
        if not match:
            raise ValueError("Invalid GCS URL format. Expected gs://bucket_name/key")

        bucket_name, key = match.groups()
        local_file_path = self.cache_dir / Path(key).name

        try:
            gcs_client = storage.Client()
            bucket = gcs_client.bucket(bucket_name)
            blob = bucket.blob(key)
            blob.download_to_filename(str(local_file_path))
            self.logger.debug(f"Downloaded GCS file to: {local_file_path}")
        except GoogleAPIError as e:
            raise RuntimeError(f"Failed to download GCS file: {e}") from e

        with local_file_path.open('r') as file:
            return file.read()

struct_module/file_item.py

Lines changed: 8 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
from openai import OpenAI
88
from dotenv import load_dotenv
99
from struct_module.template_renderer import TemplateRenderer
10+
from struct_module.content_fetcher import ContentFetcher
1011

1112
load_dotenv()
1213

@@ -25,6 +26,8 @@ def __init__(self, properties):
2526
self.input_store = properties.get("input_store")
2627
self.skip = properties.get("skip", False)
2728

29+
self.content_fetcher = ContentFetcher()
30+
2831
self.system_prompt = properties.get("system_prompt") or properties.get("global_system_prompt")
2932
self.user_prompt = properties.get("user_prompt")
3033
self.openai_client = None
@@ -82,22 +85,11 @@ def process_prompt(self, dry_run=False):
8285
def fetch_content(self):
    """Populate ``self.content`` from ``self.content_location`` if one is set.

    Retrieval is delegated to ``self.content_fetcher.fetch_content``. Any
    exception raised while fetching is logged at error level and swallowed,
    leaving ``self.content`` untouched.
    """
    if not self.content_location:
        return

    self.logger.debug(f"Fetching content from: {self.content_location}")
    try:
        self.content = self.content_fetcher.fetch_content(self.content_location)
    except Exception as e:
        # Best-effort: a failed fetch is reported but does not abort the caller.
        self.logger.error(f"Failed to fetch content from {self.content_location}: {e}")
    else:
        self.logger.debug(f"Fetched content: {self.content}")
10193

10294
def _merge_default_template_vars(self, template_vars):
10395
default_vars = {

struct_module/logging_config.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,13 @@
55
def configure_logging(level=logging.INFO, log_file=None):
66
"""Configure logging with colorlog."""
77
handler = colorlog.StreamHandler()
8+
9+
line_format = "%(log_color)s[%(levelname)s] >> %(message)s"
10+
if level == logging.DEBUG:
11+
line_format = "%(log_color)s[%(asctime)s][%(levelname)s][%(filename)s:%(lineno)d] >> %(message)s"
12+
813
handler.setFormatter(colorlog.ColoredFormatter(
9-
"%(log_color)s[%(asctime)s][%(levelname)s][struct] >>> %(message)s",
14+
line_format,
1015
datefmt='%Y-%m-%d %H:%M:%S',
1116
log_colors={
1217
'DEBUG': 'cyan',

0 commit comments

Comments
 (0)