|
| 1 | +# FILE: content_fetcher.py |
| 2 | +import os |
| 3 | +import re |
| 4 | +import requests |
| 5 | +import subprocess |
| 6 | +from pathlib import Path |
| 7 | +import hashlib |
| 8 | +import logging |
| 9 | + |
| 10 | +try: |
| 11 | + import boto3 |
| 12 | + from botocore.exceptions import NoCredentialsError, ClientError |
| 13 | + boto3_available = True |
| 14 | +except ImportError: |
| 15 | + boto3_available = False |
| 16 | + |
| 17 | +try: |
| 18 | + from google.cloud import storage |
| 19 | + from google.api_core.exceptions import GoogleAPIError |
| 20 | + gcs_available = True |
| 21 | +except ImportError: |
| 22 | + gcs_available = False |
| 23 | + |
class ContentFetcher:
    """Fetch text content from local files, HTTPS URLs, GitHub, S3 and GCS.

    Downloads and repository clones are stored under ``cache_dir``
    (``~/.struct/cache`` when not given), which is created on construction.
    """

    # Seconds to wait for an HTTP response. Without a timeout ``requests``
    # may block forever on an unresponsive server.
    HTTP_TIMEOUT = 30

    def __init__(self, cache_dir=None):
        """Create the fetcher and make sure its cache directory exists.

        Args:
            cache_dir: Directory used for cached downloads and clones.
                Falls back to ``~/.struct/cache`` when falsy.
        """
        self.logger = logging.getLogger(__name__)
        self.cache_dir = Path(cache_dir or os.path.expanduser("~/.struct/cache"))
        self.cache_dir.mkdir(parents=True, exist_ok=True)

    def fetch_content(self, content_location):
        """
        Fetch content from a given location. Supported protocols:
        - Local file (file://)
        - HTTP/HTTPS (https://)
        - GitHub repository (github://owner/repo/branch/file_path)
        - GitHub HTTPS (githubhttps://owner/repo/branch/file_path)
        - GitHub SSH (githubssh://owner/repo/branch/file_path)
        - S3 bucket (s3://bucket_name/key), only when boto3 is importable
        - Google Cloud Storage (gs://bucket_name/key), only when
          google-cloud-storage is importable

        The scheme prefix is removed before the handler is called; every
        handler accepts its location with or without that prefix.

        Returns:
            The fetched content as a string.

        Raises:
            ValueError: If no supported protocol matches ``content_location``.
        """
        core_handlers = (
            ("file://", self._fetch_local_file),
            ("https://", self._fetch_http_url),
            ("github://", self._fetch_github_file),
            ("githubhttps://", self._fetch_github_https_file),
            ("githubssh://", self._fetch_github_ssh_file),
        )
        for prefix, handler in core_handlers:
            if content_location.startswith(prefix):
                return handler(content_location[len(prefix):])

        # Cloud backends are only offered when their SDK imported successfully.
        optional_handlers = []
        if boto3_available:
            optional_handlers.append(("s3://", self._fetch_s3_file))
        if gcs_available:
            optional_handlers.append(("gs://", self._fetch_gcs_file))
        for prefix, handler in optional_handlers:
            if content_location.startswith(prefix):
                return handler(content_location[len(prefix):])

        raise ValueError(f"Unsupported content location: {content_location}")

    @staticmethod
    def _strip_scheme(location, scheme):
        """Return ``location`` without a leading ``<scheme>://``, if present."""
        prefix = f"{scheme}://"
        if location.startswith(prefix):
            return location[len(prefix):]
        return location

    @classmethod
    def _split_github_location(cls, scheme, location):
        """Split ``owner/repo/branch/file_path`` into its four parts.

        ``location`` may or may not still carry the ``<scheme>://`` prefix.

        Raises:
            ValueError: If the location does not have all four parts.
        """
        match = re.match(
            r"([^/]+)/([^/]+)/([^/]+)/(.+)", cls._strip_scheme(location, scheme)
        )
        if not match:
            raise ValueError(
                f"Invalid GitHub URL format. Expected {scheme}://owner/repo/branch/file_path"
            )
        return match.groups()

    @classmethod
    def _split_bucket_location(cls, scheme, location, error_message):
        """Split ``bucket_name/key`` into ``(bucket_name, key)``.

        ``location`` may or may not still carry the ``<scheme>://`` prefix.

        Raises:
            ValueError: With ``error_message`` if either part is missing.
        """
        match = re.match(r"([^/]+)/(.+)", cls._strip_scheme(location, scheme))
        if not match:
            raise ValueError(error_message)
        return match.groups()

    @staticmethod
    def _resolve_inside(root, relative_path):
        """Resolve ``relative_path`` under ``root`` and refuse to leave it.

        Raises:
            ValueError: If the resolved path is outside ``root`` (for example
                through ``..`` segments or a symlink).
        """
        resolved_root = Path(root).resolve()
        candidate = (resolved_root / relative_path).resolve()
        if candidate != resolved_root and resolved_root not in candidate.parents:
            raise ValueError(f"File path escapes the repository: {relative_path}")
        return candidate

    def _cache_path_for(self, identifier):
        """Return the cache file path derived from the MD5 of ``identifier``."""
        # MD5 is used purely as a filename-safe key, not for security.
        return self.cache_dir / hashlib.md5(identifier.encode()).hexdigest()

    def _fetch_local_file(self, file_path):
        """Read and return the text of a local file (path without ``file://``)."""
        self.logger.debug("Fetching content from local file: %s", file_path)
        return Path(file_path).read_text(encoding="utf-8")

    def _fetch_http_url(self, url):
        """Return the body of ``url``, using an on-disk cache keyed by the URL.

        A cached copy, when present, is returned without contacting the
        server. ``url`` may be given without a scheme, in which case
        ``https://`` is assumed (this is how ``fetch_content`` passes it).

        Raises:
            requests.HTTPError: If the server answers with an error status.
        """
        if not url.startswith(("http://", "https://")):
            url = f"https://{url}"
        self.logger.debug("Fetching content from URL: %s", url)

        cache_file_path = self._cache_path_for(url)
        if cache_file_path.exists():
            self.logger.debug("Loading content from cache: %s", cache_file_path)
            return cache_file_path.read_text(encoding="utf-8")

        response = requests.get(url, timeout=self.HTTP_TIMEOUT)
        response.raise_for_status()
        cache_file_path.write_text(response.text, encoding="utf-8")
        return response.text

    def _fetch_github_file(self, github_url):
        """
        Fetch a file from a GitHub repository using HTTPS.
        Expected format: github://owner/repo/branch/file_path
        (the ``github://`` prefix is optional).
        """
        self.logger.debug("Fetching content from GitHub: %s", github_url)
        owner, repo, branch, file_path = self._split_github_location("github", github_url)
        return self._clone_or_fetch_github(owner, repo, branch, file_path, https=True)

    def _fetch_github_https_file(self, github_url):
        """
        Fetch a file from a GitHub repository using HTTPS.
        Expected format: githubhttps://owner/repo/branch/file_path
        (the ``githubhttps://`` prefix is optional).
        """
        self.logger.debug("Fetching content from GitHub (HTTPS): %s", github_url)
        owner, repo, branch, file_path = self._split_github_location("githubhttps", github_url)
        return self._clone_or_fetch_github(owner, repo, branch, file_path, https=True)

    def _fetch_github_ssh_file(self, github_url):
        """
        Fetch a file from a GitHub repository using SSH.
        Expected format: githubssh://owner/repo/branch/file_path
        (the ``githubssh://`` prefix is optional).
        """
        self.logger.debug("Fetching content from GitHub (SSH): %s", github_url)
        owner, repo, branch, file_path = self._split_github_location("githubssh", github_url)
        return self._clone_or_fetch_github(owner, repo, branch, file_path, https=False)

    def _clone_or_fetch_github(self, owner, repo, branch, file_path, https=True):
        """Clone (or pull) a GitHub repository into the cache and read a file.

        Args:
            owner: Repository owner.
            repo: Repository name.
            branch: Branch to clone.
            file_path: Path of the file relative to the repository root.
            https: Clone over HTTPS when true, over SSH otherwise.

        Returns:
            The text content of the requested file.

        Raises:
            subprocess.CalledProcessError: If ``git`` exits with an error.
            ValueError: If ``file_path`` points outside the repository.
            FileNotFoundError: If the file does not exist in the clone.
        """
        repo_cache_path = self.cache_dir / f"{owner}_{repo}_{branch}"
        if https:
            clone_url = f"https://github.com/{owner}/{repo}.git"
        else:
            clone_url = f"git@github.com:{owner}/{repo}.git"

        # Clone or fetch the repository
        if not repo_cache_path.exists():
            self.logger.debug("Cloning repository: %s/%s (branch: %s)", owner, repo, branch)
            # "--" stops git from reading the URL or target path as options.
            subprocess.run(
                ["git", "clone", "-b", branch, "--", clone_url, str(repo_cache_path)],
                check=True,
            )
        else:
            self.logger.debug(
                "Repository already cloned. Pulling latest changes for: %s", repo_cache_path
            )
            subprocess.run(["git", "-C", str(repo_cache_path), "pull"], check=True)

        # Read the requested file, refusing paths that leave the clone.
        file_full_path = self._resolve_inside(repo_cache_path, file_path)
        if not file_full_path.exists():
            raise FileNotFoundError(
                f"File {file_path} not found in repository {owner}/{repo} on branch {branch}"
            )
        return file_full_path.read_text(encoding="utf-8")

    def _fetch_s3_file(self, s3_url):
        """
        Fetch a file from an S3 bucket.
        Expected format: s3://bucket_name/key
        (the ``s3://`` prefix is optional).

        Raises:
            ImportError: If boto3 is not installed.
            ValueError: If the location is malformed.
            FileNotFoundError: If S3 reports a 404 for the key.
            RuntimeError: If credentials are missing or the download fails.
        """
        if not boto3_available:
            raise ImportError("boto3 is not installed. Please install it to use S3 fetching.")

        self.logger.debug("Fetching content from S3: %s", s3_url)
        bucket_name, key = self._split_bucket_location(
            "s3", s3_url, "Invalid S3 URL format. Expected s3://bucket_name/key"
        )
        # Key the local copy on bucket + key so equal basenames cannot collide.
        local_file_path = self._cache_path_for(f"s3://{bucket_name}/{key}")

        try:
            s3_client = boto3.Session().client("s3")
            s3_client.download_file(bucket_name, key, str(local_file_path))
        except NoCredentialsError as e:
            raise RuntimeError(
                "AWS credentials not found. Ensure that your credentials are configured properly."
            ) from e
        except ClientError as e:
            error_code = e.response.get("Error", {}).get("Code")
            if error_code == "404":
                raise FileNotFoundError(f"The specified S3 key does not exist: {key}") from e
            raise RuntimeError(f"Failed to download S3 file: {e}") from e

        self.logger.debug("Downloaded S3 file to: %s", local_file_path)
        return local_file_path.read_text(encoding="utf-8")

    def _fetch_gcs_file(self, gcs_url):
        """
        Fetch a file from Google Cloud Storage.
        Expected format: gs://bucket_name/key
        (the ``gs://`` prefix is optional).

        Raises:
            ImportError: If google-cloud-storage is not installed.
            ValueError: If the location is malformed.
            RuntimeError: If the download fails.
        """
        if not gcs_available:
            raise ImportError(
                "google-cloud-storage is not installed. Please install it to use GCS fetching."
            )

        self.logger.debug("Fetching content from GCS: %s", gcs_url)
        bucket_name, key = self._split_bucket_location(
            "gs", gcs_url, "Invalid GCS URL format. Expected gs://bucket_name/key"
        )
        # Key the local copy on bucket + key so equal basenames cannot collide.
        local_file_path = self._cache_path_for(f"gs://{bucket_name}/{key}")

        try:
            blob = storage.Client().bucket(bucket_name).blob(key)
            blob.download_to_filename(str(local_file_path))
        except GoogleAPIError as e:
            raise RuntimeError(f"Failed to download GCS file: {e}") from e

        self.logger.debug("Downloaded GCS file to: %s", local_file_path)
        return local_file_path.read_text(encoding="utf-8")
0 commit comments