stripe_docs_markdown/filter_docs.py at main · arthurauffray/stripe_docs_markdown · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
# Use this script to create a copy of the /docs directory in which
# language-specific code blocks are kept only for the language(s) you
# choose (falling back to curl in files that contain none of them).

import argparse
import shutil
import os

# Code-block languages treated as mutually exclusive "request" variants.
# A block tagged with one of these is dropped during filtering unless its
# language was selected to be kept (a requested language, or the curl
# fallback). Blocks in any other language are never dropped.
EXCLUSIVE_LANGS = {
    'cli',
    'csharp',
    'curl',
    'dotnet',
    'go',
    'java',
    'node',
    'php',
    'python',
    'ruby',
}

def get_block_lang(line):
    """
    Returns the language of a code block start line, or None if not a start line.
    Returns '' for empty code blocks (```).

    A line counts as a fence when, after leading whitespace, it starts with
    three or more backticks. The language is the first whitespace-separated
    word of the text that follows the backticks, lowercased, so trailing
    attributes (e.g. ```python title="app.py") do not become part of the name.
    """
    stripped = line.lstrip()
    if not stripped.startswith('```'):
        return None

    # Drop the whole backtick run (fences may be longer than three) and any
    # surrounding whitespace/newline, leaving only the info string.
    info = stripped.lstrip('`').strip()
    if not info:
        return ''

    # Only the first word names the language; the rest is metadata.
    return info.split()[0].lower()

def get_file_languages(filepath):
    """
    Collects the language tags of the fenced code blocks opened in a markdown file.

    The file is read as UTF-8 with undecodable bytes ignored. Every line that
    starts (after leading whitespace) with ``` toggles between "inside a block"
    and "outside a block"; only opening fences contribute a language, and
    fences without a language contribute nothing.

    Returns a set of language strings as produced by get_block_lang.
    """
    found = set()
    inside_fence = False
    with open(filepath, 'r', encoding='utf-8', errors='ignore') as handle:
        for raw in handle:
            if not raw.lstrip().startswith('```'):
                continue
            if not inside_fence:
                # Opening fence: record its language when one is given.
                tag = get_block_lang(raw)
                if tag:
                    found.add(tag)
            # Any fence line flips the state (open -> close, close -> open).
            inside_fence = not inside_fence
    return found

def filter_file_content(filepath, target_langs):
    """
    Rewrites a markdown file in place, dropping unwanted language-specific code blocks.

    target_langs: iterable of language names (e.g., ['python', 'ruby']);
    matching is done on their lowercased form.

    Which blocks survive:
    - blocks whose language is not in EXCLUSIVE_LANGS (json, text, untagged, ...)
      are always kept;
    - blocks in a requested language that occurs in the file are kept;
    - if none of the requested languages occurs in the file but 'curl' does,
      the 'curl' blocks are kept as a fallback;
    - every other EXCLUSIVE_LANGS block is removed, fences included.
    """
    # Languages actually used by this file's code blocks.
    present = get_file_languages(filepath)

    # Requested languages that the file really contains.
    requested = {name.lower() for name in target_langs}
    keep_langs = requested & present
    if not keep_langs and 'curl' in present:
        # Nothing requested is available here: fall back to curl.
        keep_langs = {'curl'}

    # Load everything up front because the same path is overwritten below.
    with open(filepath, 'r', encoding='utf-8', errors='ignore') as src:
        source_lines = list(src)

    kept = []
    inside_fence = False
    dropping = False

    for raw in source_lines:
        is_fence = raw.lstrip().startswith('```')

        if is_fence and not inside_fence:
            # Opening fence: decide the fate of the whole block.
            lang = get_block_lang(raw)
            dropping = lang in EXCLUSIVE_LANGS and lang not in keep_langs

        if not dropping:
            kept.append(raw)

        if is_fence:
            if inside_fence:
                # Closing fence handled above; text after it is kept again.
                dropping = False
            inside_fence = not inside_fence

    # Overwrite the original file with the surviving lines.
    with open(filepath, 'w', encoding='utf-8') as dst:
        dst.writelines(kept)

def process_directory(source_dir, dest_dir, target_langs):
    """
    Copies source_dir to dest_dir and filters every .md file in the copy.

    source_dir: existing directory to copy from.
    dest_dir: directory to create; if it already exists it is removed first.
    target_langs: list of language names passed on to filter_file_content.

    Raises FileNotFoundError / NotADirectoryError if source_dir is not an
    existing directory, and ValueError if dest_dir is the same directory as
    source_dir or contains it. All checks run before anything is deleted.
    """
    # Validate the source BEFORE touching the destination, so a typo in the
    # source path cannot wipe an existing destination for nothing.
    if not os.path.exists(source_dir):
        raise FileNotFoundError(f"Source directory does not exist: {source_dir}")
    if not os.path.isdir(source_dir):
        raise NotADirectoryError(f"Source is not a directory: {source_dir}")

    # Removing the destination must never remove the source: refuse when the
    # destination is the source itself or one of its parent directories.
    src_real = os.path.normcase(os.path.realpath(source_dir))
    dst_real = os.path.normcase(os.path.realpath(dest_dir))
    if src_real == dst_real or src_real.startswith(os.path.join(dst_real, '')):
        raise ValueError(
            f"Destination {dest_dir} must not be the source directory "
            f"{source_dir} or one of its parents."
        )

    # 1. Copy entire directory structure
    if os.path.exists(dest_dir):
        print(f"Destination {dest_dir} already exists. Removing it to start fresh...")
        shutil.rmtree(dest_dir)

    print(f"Copying {source_dir} to {dest_dir}...")
    shutil.copytree(source_dir, dest_dir)

    # 2. Walk through destination and filter .md files
    print(f"Filtering markdown files for languages: {target_langs}")
    count = 0
    for root, _dirs, files in os.walk(dest_dir):
        for file in files:
            if file.endswith('.md'):
                filepath = os.path.join(root, file)
                filter_file_content(filepath, target_langs)
                count += 1

    print(f"Processed {count} markdown files.")

if __name__ == "__main__":
    # Command-line entry point: all three options are mandatory.
    cli = argparse.ArgumentParser(
        description="Copy docs and filter code blocks by language.",
    )
    cli.add_argument(
        "--source", "-s",
        required=True,
        help="Source directory (e.g. docs)",
    )
    cli.add_argument(
        "--destination", "-d",
        required=True,
        help="Destination directory (e.g. docs_python)",
    )
    # One or more language names, space separated.
    cli.add_argument(
        "--languages", "-l",
        nargs='+',
        required=True,
        help="Target languages (e.g. python ruby)",
    )

    options = cli.parse_args()

    process_directory(options.source, options.destination, options.languages)