import json
import os
import tempfile

import jupytext
import nbformat
from datasets import load_dataset, concatenate_datasets
from guesslang import Guess
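
# Two-pass notebook-to-script conversion for bigcode/stack_v2_notebooks_small:
# first convert each notebook with jupytext using its own language metadata,
# then retry the failures by guessing the language of the code cells with
# guesslang.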


def dump_and_extract_text(notebook, file_name):
    """Convert the notebook to a script, save it in a temp file, then extract
    the text; this round-trip through a file is needed for jupytext.
    The conversion may fail for malformed notebooks."""
    temp_dir = tempfile.gettempdir()
    temp_file_name = os.path.join(temp_dir, file_name)
    try:
        jupytext.write(notebook, temp_file_name)
        with open(temp_file_name, "r") as fp:
            text = fp.read()
        return True, text
    except Exception:
        return False, ""


def get_file_name_and_extension(notebook):
    """Pick a file name and extension from the notebook's language metadata,
    falling back to the kernelspec, and to .ipynb when nothing matches."""
    try:
        extension = notebook.metadata.language_info.file_extension
        return f"file{extension}", extension
    except (AttributeError, KeyError):
        if "py" in str(getattr(notebook.metadata, "kernelspec", "")):
            return "file.py", ".py"
        return "file.ipynb", ".ipynb"


def convert_nbformat3_to4(json_content):
    """Some notebooks are nbformat 3; upgrade them to nbformat 4 so that jupytext can handle them."""
    notebook = nbformat.from_dict(json_content)
    new_nb = nbformat.v4.convert.upgrade(notebook, from_version=3, from_minor=0)
    return new_nb


def convert_nb_script(examples):
    """Batched map function: parse each notebook's JSON, upgrade nbformat 3
    notebooks, and convert them to scripts with jupytext."""
    scripts = []
    extensions = []
    nbformats = []
    for content in examples["content"]:
        extension, text = "", ""
        nbformat3 = False
        try:
            json_content = json.loads(content)
            # Convert JSON content to an nbformat notebook
            if json_content["nbformat"] == 3:
                notebook = convert_nbformat3_to4(json_content)
                nbformat3 = True
            else:
                notebook = nbformat.from_dict(json_content)
            # Join cell sources (they may be stored as lists of lines)
            for cell in notebook.cells:
                cell.source = "".join(cell.source)
            file_name, extension = get_file_name_and_extension(notebook)
            _, text = dump_and_extract_text(notebook, file_name)
        except Exception:
            # Parsing or conversion failed; leave script and extension empty
            pass
        scripts.append(text)
        extensions.append(extension)
        nbformats.append(nbformat3)
    return {"script": scripts, "conversion_extension": extensions, "is_nbformat_3": nbformats}


def get_size_text(example):
    return {"size": len(example["content"])}


# Map language names (as found in notebook metadata or guessed by guesslang)
# to script file extensions
lang2ext = {
"qsharp": ".py",
"python": ".py",
"coconut": ".coco",
"R": ".r",
"julia": ".jl",
"c++": ".cpp",
"scheme": ".scm",
"clojure": ".clj",
"bash": ".sh",
"powershell": ".ps1",
"q": ".q",
"matlab": ".m",
"Wolfram Language": ".wolfram",
"idl": ".pro",
"javascript": ".js",
"typescript": ".ts",
"scala": ".scala",
"rust": ".rs",
"robotframework": ".resource",
"csharp": ".cs",
"fsharp": ".fs",
"sos": ".sos",
"java": ".java",
"groovy": ".groovy",
"sage": ".sage",
"ocaml": ".ml",
"haskell": ".hs",
"tcl": ".tcl",
"maxima": ".mac",
"gnuplot": ".gp",
}
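

# guesslang classifies source code with a pretrained model; instantiate it once
# at module level so the batched map below reuses the same instance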
guess = Guess()


def convert_failed_nb_script(examples):
    """Convert notebooks whose extension/language identification failed in
    convert_nb_script, guessing the language of the code cells instead."""
scripts = []
extensions = []
for content, extension in zip(examples["content"], examples["conversion_extension"]):
# Convert JSON content to nbformat notebook
json_content = json.loads(content)
if json_content["nbformat"] == 3:
notebook = convert_nbformat3_to4(json_content)
else:
notebook = nbformat.from_dict(json_content)
# Join cell sources
for cell in notebook.cells:
cell.source = "".join(cell.source)
        # Strip all notebook metadata from the header of the generated script
notebook.metadata["jupytext"] = {"notebook_metadata_filter": "-all"}
# Extract code from notebook cells
data = "\n".join(
[cell.source for cell in notebook.cells if cell.cell_type == "code"]
)
ext, text = "", ""
if not data.isspace():
# Guess language of the code
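            # guess.probabilities returns (language name, probability) pairs,
            # most probable first; only trust guesses with probability >= 0.5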
for p in guess.probabilities(data):
if p[1] >= 0.5:
try:
# Get file extension for the guessed language
ext = lang2ext[
[lang for lang in lang2ext if lang in p[0].lower()][0]
]
# Construct file path
file_name = f"file{ext}"
# Write notebook as a script
_, text = dump_and_extract_text(notebook, file_name)
break
except IndexError:
pass
scripts.append(text)
extension = ext if ext else extension
extensions.append(extension)
return {"script": scripts, "conversion_extension": extensions}


if __name__ == "__main__":
dataset = load_dataset("bigcode/stack_v2_notebooks_small", split="train", num_proc=22)
print(f"Initial number of samples: {len(dataset)}")
    # multiprocessing seems to give incorrect results here, so run the conversion in a single process
dataset = dataset.map(convert_nb_script, batched=True, batch_size=2_000)
nbformat3 = dataset.filter(lambda x: x["is_nbformat_3"])
print(f"Number of nbformat 3 samples: {len(nbformat3)}, percentage compared to the whole dataset: {len(nbformat3) * 100 / len(dataset)}%")
# samples that need fixing
valid_samples = dataset.filter(lambda x: x["conversion_extension"] != ".ipynb" and x["script"] != "")
print(f"Number of valid converted scripts: {len(valid_samples)}, makes up: {len(valid_samples) * 100 / len(dataset)}% of teh whole dataset")
invalid_samples = dataset.filter(lambda x: x["conversion_extension"] == ".ipynb")
print(f"Fixing {len(invalid_samples)} invalid samples ({len(invalid_samples) * 100 / len(dataset)}%)")
invalid_samples_fixed = invalid_samples.map(convert_failed_nb_script, batched=True, batch_size=2_000)
    # a remaining .ipynb extension means the notebook still wasn't properly converted to a script
fixed_samples = invalid_samples_fixed.filter(lambda x: x["conversion_extension"] != ".ipynb" and x["script"] != "")
print(f"We fixed {len(fixed_samples)} samples ({len(fixed_samples) * 100 / len(dataset)}%)")
# concatenate datasets
final_dataset = concatenate_datasets([valid_samples, fixed_samples])
final_dataset = final_dataset.map(get_size_text, num_proc=36)
print(f"Final dataset size: {len(final_dataset)} files and {sum(final_dataset['size']) / 10**9}GB of text")
final_dataset.push_to_hub("stack_v2_notebooks_small_scripts")
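    # The pushed dataset can be reloaded later with, e.g.
    # load_dataset("<username>/stack_v2_notebooks_small_scripts", split="train")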