-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathfilter_docs.py
More file actions
155 lines (131 loc) · 5.41 KB
/
filter_docs.py
File metadata and controls
155 lines (131 loc) · 5.41 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
# Use this script to create a clone of the /docs
# directory with codeblocks of only your
# language of choice.
import argparse
import shutil
import os
# Code-fence languages treated as alternative renderings of the same request.
# Blocks in these languages are dropped unless the language was requested
# (or it is the 'curl' fallback); every other fence language is always kept.
EXCLUSIVE_LANGS = {
    'cli',
    'csharp',
    'curl',
    'dotnet',
    'go',
    'java',
    'node',
    'php',
    'python',
    'ruby',
}
def get_block_lang(line):
    """
    Returns the language of a code block fence line, or None if the line is
    not a fence (does not start with three backticks after leading whitespace).
    Returns '' for fences that carry no language.

    Only the first word of the text following the backticks is used, and it
    is lower-cased, so a fence such as '```Python title="app.py"' yields
    'python' rather than the whole info string. Fences written with more than
    three backticks (e.g. '````ruby') are handled by discarding every leading
    backtick before looking for the language.
    """
    stripped = line.lstrip()
    if not stripped.startswith('```'):
        return None
    # Drop the whole run of backticks, then split on whitespace: the first
    # word (if any) is the language, anything after it is extra metadata.
    words = stripped.lstrip('`').split()
    return words[0].lower() if words else ''
def get_file_languages(filepath):
    """
    Collects the languages named on the opening fences of a markdown file.

    Fence lines are treated as alternating open/close markers; only opening
    fences are inspected, and fences without a language contribute nothing.
    Returns a set of the language strings reported by get_block_lang.
    """
    found = set()
    inside_fence = False
    with open(filepath, 'r', encoding='utf-8', errors='ignore') as handle:
        for raw_line in handle:
            if not raw_line.lstrip().startswith('```'):
                continue
            if inside_fence:
                # This fence closes the current block; nothing to record.
                inside_fence = False
                continue
            # This fence opens a block: record its language when it has one.
            inside_fence = True
            language = get_block_lang(raw_line)
            if language:
                found.add(language)
    return found
def filter_file_content(filepath, target_langs):
    """
    Reads the file, keeps only the appropriate code blocks, and overwrites the file.

    target_langs: an iterable of language names (e.g., ['python', 'ruby']);
    matching is case-insensitive.

    Keep strategy:
    - Blocks whose language is not in EXCLUSIVE_LANGS (json, text, ...) are
      always kept.
    - Every requested language that occurs in the file is kept.
    - If none of the requested languages occurs in the file but 'curl' does,
      'curl' blocks are kept as a fallback.
    - All other EXCLUSIVE_LANGS blocks are removed, fences included.

    The file is read and written with newline='' and the 'surrogateescape'
    error handler so that line endings and any bytes that are not valid UTF-8
    are written back unchanged instead of being translated or dropped. The
    file is only rewritten when at least one line was removed.
    """
    available_langs = get_file_languages(filepath)

    # Requested languages that actually occur in this file.
    requested = {lang.lower() for lang in target_langs}
    keep_langs = requested & available_langs
    if not keep_langs and 'curl' in available_langs:
        # Fallback to curl if no target language is present.
        keep_langs = {'curl'}

    with open(filepath, 'r', encoding='utf-8',
              errors='surrogateescape', newline='') as f:
        lines = f.readlines()

    filtered_lines = []
    in_block = False
    skipping = False
    for line in lines:
        if not line.lstrip().startswith('```'):
            # Ordinary line: kept unless it sits inside a dropped block.
            if not skipping:
                filtered_lines.append(line)
            continue

        if in_block:
            # Closing fence: it shares the fate of the block it closes.
            if not skipping:
                filtered_lines.append(line)
            in_block = False
            skipping = False
        else:
            # Opening fence: drop the block only when its language is an
            # exclusive one that was not selected above.
            lang = get_block_lang(line)
            skipping = lang in EXCLUSIVE_LANGS and lang not in keep_langs
            if not skipping:
                filtered_lines.append(line)
            in_block = True

    # Filtering only ever removes lines, so equal length means no change
    # and the file can be left untouched.
    if len(filtered_lines) == len(lines):
        return

    with open(filepath, 'w', encoding='utf-8',
              errors='surrogateescape', newline='') as f:
        f.writelines(filtered_lines)
def process_directory(source_dir, dest_dir, target_langs):
    """
    Copies source_dir to dest_dir and filters every '.md' file in the copy.

    An existing dest_dir is removed first so the copy starts fresh. Each
    markdown file found under dest_dir is then passed to filter_file_content
    together with target_langs.

    Raises:
        FileNotFoundError: source_dir does not exist.
        NotADirectoryError: source_dir exists but is not a directory.
        ValueError: dest_dir is the same directory as source_dir or contains
            it, in which case removing dest_dir would delete the source.
    """
    # Validate before touching the destination: the original order removed an
    # existing destination even when the copy was bound to fail afterwards.
    if not os.path.exists(source_dir):
        raise FileNotFoundError(f"Source directory {source_dir} does not exist.")
    if not os.path.isdir(source_dir):
        raise NotADirectoryError(f"Source {source_dir} is not a directory.")

    src_real = os.path.normcase(os.path.realpath(source_dir))
    dst_real = os.path.normcase(os.path.realpath(dest_dir))
    try:
        common = os.path.commonpath([src_real, dst_real])
    except ValueError:
        # Paths on different drives cannot contain one another.
        common = None
    if common == dst_real:
        raise ValueError(
            f"Destination {dest_dir} is the same as, or contains, the source "
            f"{source_dir}; refusing to remove it."
        )

    # 1. Copy entire directory structure
    if os.path.exists(dest_dir):
        print(f"Destination {dest_dir} already exists. Removing it to start fresh...")
        shutil.rmtree(dest_dir)
    print(f"Copying {source_dir} to {dest_dir}...")
    shutil.copytree(source_dir, dest_dir)

    # 2. Walk through destination and filter .md files
    print(f"Filtering markdown files for languages: {target_langs}")
    count = 0
    for root, _dirs, files in os.walk(dest_dir):
        for file in files:
            if file.endswith('.md'):
                filter_file_content(os.path.join(root, file), target_langs)
                count += 1
    print(f"Processed {count} markdown files.")
if __name__ == "__main__":
    # Command-line entry point: all three options are mandatory, and
    # --languages accepts one or more values.
    cli = argparse.ArgumentParser(
        description="Copy docs and filter code blocks by language.",
    )
    cli.add_argument(
        "--source", "-s",
        required=True,
        help="Source directory (e.g. docs)",
    )
    cli.add_argument(
        "--destination", "-d",
        required=True,
        help="Destination directory (e.g. docs_python)",
    )
    cli.add_argument(
        "--languages", "-l",
        nargs='+',
        required=True,
        help="Target languages (e.g. python ruby)",
    )
    options = cli.parse_args()
    process_directory(options.source, options.destination, options.languages)