-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathfilter_docs.py
More file actions
241 lines (200 loc) · 8.57 KB
/
filter_docs.py
File metadata and controls
241 lines (200 loc) · 8.57 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
# Use this script to create a clone of the /docs or /api_reference
# directory with code snippets of only your
# language of choice.
#
# Gemini docs use heading-based language sections (### Python, ### JavaScript, etc.)
# with indented code blocks, rather than fenced code blocks with language tags.
import argparse
import shutil
import os
import re
# Default language(s) to filter for when not specified
DEFAULT_LANGUAGES = ['python', 'rest']
# Languages that are considered mutually exclusive "request" languages.
# If we keep one of these, we drop the others (unless they are the fallback).
# These correspond to the ### headings used in the Gemini docs.
EXCLUSIVE_LANGS = {
'python', 'javascript', 'go', 'java', 'c#', 'rest', 'apps script'
}
# Mapping of common aliases to canonical heading names
LANG_ALIASES = {
'py': 'python',
'js': 'javascript',
'node': 'javascript',
'node.js': 'javascript',
'typescript': 'javascript',
'ts': 'javascript',
'golang': 'go',
'curl': 'rest',
'csharp': 'c#',
'dotnet': 'c#',
'.net': 'c#',
'apps_script': 'apps script',
'appsscript': 'apps script',
}
def normalize_lang(lang):
"""Normalize a language name to its canonical form."""
lower = lang.lower().strip()
return LANG_ALIASES.get(lower, lower)
def get_heading_lang(line):
"""
Returns the language of a heading line (### Language), or None if not a heading.
Only matches level-3 headings (###) that correspond to known languages.
"""
stripped = line.strip()
match = re.match(r'^###\s+(.+)$', stripped)
if match:
heading_text = match.group(1).strip()
normalized = normalize_lang(heading_text)
if normalized in EXCLUSIVE_LANGS:
return normalized
return None
def get_file_languages(filepath):
"""
Scans a markdown file and returns a set of all languages found as ### headings.
"""
langs = set()
with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
for line in f:
lang = get_heading_lang(line)
if lang:
langs.add(lang)
return langs
def filter_file_content(filepath, target_langs):
"""
Reads the file, keeps only the appropriate language sections, and overwrites the file.
target_langs: a list of strings (e.g., ['python', 'rest'])
Gemini docs format:
- Language sections start with ### <Language> headings
- Code is in indented blocks (4 spaces) below the heading
- A section ends when the next heading of equal or higher level is found,
or a non-indented, non-empty line that isn't part of the code block
"""
# 1. Identify available languages
available_langs = get_file_languages(filepath)
# Normalize target languages
target_lower_list = [normalize_lang(l) for l in target_langs]
# 2. Determine Keep Strategy
keep_langs = set()
# Strategy:
# - for each requested language in target_lower_list, if present in file, keep it.
# - if NO requested language is present, check for 'rest' (curl).
# - if 'rest' is present, keep 'rest' (fallback).
# - otherwise, keep none of the exclusive languages.
any_target_found = False
for t_lang in target_lower_list:
if t_lang in available_langs:
keep_langs.add(t_lang)
any_target_found = True
if not any_target_found and 'rest' in available_langs:
# Fallback to REST/curl if no target found
keep_langs.add('rest')
# Read content
with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
lines = f.readlines()
filtered_lines = []
skipping = False
current_lang_section = None
for i, line in enumerate(lines):
lang = get_heading_lang(line)
if lang is not None:
# This is a language heading
if lang in EXCLUSIVE_LANGS:
if lang in keep_langs:
skipping = False
current_lang_section = lang
else:
skipping = True
current_lang_section = lang
else:
skipping = False
current_lang_section = None
if not skipping:
filtered_lines.append(line)
continue
# Check if we hit a non-language heading (##, ###, etc) which ends the current section
# Markdown headings have at most 3 spaces before the #
if re.match(r'^\s{0,3}#+\s+', line):
# Any heading ends the current language section
skipping = False
current_lang_section = None
filtered_lines.append(line)
continue
if not skipping:
filtered_lines.append(line)
# Write back
with open(filepath, 'w', encoding='utf-8') as f:
f.writelines(filtered_lines)
def process_directory(source_dir, dest_dir, target_langs):
# If dest_dir is None, filter in-place
if dest_dir is None:
if os.path.isfile(source_dir):
# Filter single file in-place
if source_dir.endswith('.md'):
print(f"Filtering {source_dir} in-place...")
filter_file_content(source_dir, target_langs)
print("Processed 1 markdown file.")
else:
print("Source file is not a markdown file; no filtering applied.")
else:
# Filter directory in-place
print(f"Filtering markdown files in {source_dir} in-place for languages: {target_langs}")
count = 0
for root, dirs, files in os.walk(source_dir):
for file in files:
if file.endswith('.md'):
filepath = os.path.join(root, file)
filter_file_content(filepath, target_langs)
count += 1
print(f"Processed {count} markdown files.")
return
# If source is a file, copy just that file and filter it.
if os.path.isfile(source_dir):
# Determine destination file path
if os.path.isdir(dest_dir):
dest_file = os.path.join(dest_dir, os.path.basename(source_dir))
else:
dest_parent = os.path.dirname(dest_dir)
if dest_parent:
os.makedirs(dest_parent, exist_ok=True)
dest_file = dest_dir
if os.path.exists(dest_file):
print(f"Destination {dest_file} already exists. Overwriting...")
else:
print(f"Copying file {source_dir} to {dest_file}...")
shutil.copy2(source_dir, dest_file)
if dest_file.endswith('.md'):
filter_file_content(dest_file, target_langs)
print("Processed 1 markdown file.")
else:
print("Source file copied but not a markdown file; no filtering applied.")
return
# Otherwise assume source is a directory: copy entire directory structure
if os.path.exists(dest_dir):
print(f"Destination {dest_dir} already exists. Removing it to start fresh...")
shutil.rmtree(dest_dir)
print(f"Copying {source_dir} to {dest_dir}...")
shutil.copytree(source_dir, dest_dir)
# Walk through destination and filter .md files
print(f"Filtering markdown files for languages: {target_langs}")
count = 0
for root, dirs, files in os.walk(dest_dir):
for file in files:
if file.endswith('.md'):
filepath = os.path.join(root, file)
filter_file_content(filepath, target_langs)
count += 1
print(f"Processed {count} markdown files.")
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Copy Gemini docs and filter code snippets by language.")
# Positional source allows calling: python filter_docs.py docs/file.md
parser.add_argument('pos_source', nargs='?', help='Positional source file or directory (optional alternative to -s)')
parser.add_argument("--source", "-s", dest='opt_source', help="Source file or directory (e.g. docs or docs/file.md)")
parser.add_argument("--destination", "-d", default=None, help="Destination file or directory (optional; if not provided, filters in-place)")
parser.add_argument("--languages", "-l", nargs='+', default=DEFAULT_LANGUAGES, help=f"Target languages (e.g. python javascript). Default: {DEFAULT_LANGUAGES}")
args = parser.parse_args()
# Choose source: positional overrides if present, otherwise use -s/--source
source = args.pos_source if args.pos_source else args.opt_source
if not source:
parser.error('a source is required (positional or --source/-s)')
process_directory(source, args.destination, args.languages)