-
Notifications
You must be signed in to change notification settings - Fork 3
Expand file tree
/
Copy pathcursor.py
More file actions
249 lines (205 loc) · 7.34 KB
/
Copy pathcursor.py
File metadata and controls
249 lines (205 loc) · 7.34 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
# /// script
# description = "Glob a source tree for Python files, extract their classes, methods and functions, caption each one with an LLM, and embed the captions."
# requires-python = ">=3.12, <3.13"
# dependencies = ["daft[openai]>=0.7.10", "numpy", "python-dotenv"]
# ///
from pydantic import BaseModel
import daft
from daft import DataType
# Field layout of one function/method record. The keys match the dict that
# _extract_function_metadata returns, in the same order.
_FUNCTION_FIELDS = {
    "name": DataType.string(),  # identifier of the function
    "code": DataType.string(),  # source segment of the whole definition
    "docstring": DataType.string(),  # docstring text, null when absent
    "start_line": DataType.int64(),  # 1-indexed line of the definition
    "end_line": DataType.int64(),  # 1-indexed last line of the definition
    "decorators": DataType.list(DataType.string()),  # decorator source text
    "signature": DataType.string(),  # rendered header of the definition
    "body": DataType.string(),  # source lines from the first body statement
    "is_async": DataType.bool(),  # True for ``async def``
}

FUNCTION_SCHEMA = DataType.struct(_FUNCTION_FIELDS)
# Field layout of one class record; ``methods`` nests FUNCTION_SCHEMA so each
# class carries the metadata of the functions defined directly in its body.
_CLASS_STRUCT = DataType.struct(
    {
        "name": DataType.string(),  # identifier of the class
        "code": DataType.string(),  # source segment of the whole class
        "docstring": DataType.string(),  # docstring text, null when absent
        "start_line": DataType.int64(),  # 1-indexed line of the class
        "end_line": DataType.int64(),  # 1-indexed last line of the class
        "decorators": DataType.list(DataType.string()),  # decorator source text
        "bases": DataType.list(DataType.string()),  # base-class expressions
        "methods": DataType.list(FUNCTION_SCHEMA),  # one entry per method
    }
)

# A file yields a list of class records.
CLASSES_SCHEMA = DataType.list(_CLASS_STRUCT)
def _extract_function_metadata(node, file_content):
    """Collect metadata for a single function definition.

    Args:
        node: An ``ast.FunctionDef`` or ``ast.AsyncFunctionDef`` taken from a
            tree that was parsed out of ``file_content``.
        file_content: The full source text the node was parsed from.

    Returns:
        A dict with the keys ``name``, ``code``, ``docstring``, ``start_line``,
        ``end_line``, ``decorators``, ``signature``, ``body`` and ``is_async``.

        ``signature`` is prefixed with ``async def`` for coroutine functions
        and ``def`` otherwise. ``body`` holds the whole source lines from the
        first body statement through the end of the definition, so it includes
        the docstring statement when there is one, and for a one-line
        definition it includes the header too.
    """
    import ast
    import re

    is_async = isinstance(node, ast.AsyncFunctionDef)

    # Render the header; keep the ``async`` keyword so that the signature
    # agrees with the ``is_async`` field.
    keyword = "async def" if is_async else "def"
    signature = f"{keyword} {node.name}({ast.unparse(node.args)})"
    if node.returns:
        signature += f" -> {ast.unparse(node.returns)}"

    body = ""
    if node.body:
        # ast line numbers only count \n, \r\n and \r as line breaks.
        # str.splitlines() also breaks on form feed, \v, \x1c-\x1e, \x85,
        # \u2028 and \u2029, which would shift every following line index, so
        # split on the three real newline forms instead.
        lines = re.split(r"\r\n|\r|\n", file_content)
        # lineno is 1-indexed, the list is 0-indexed; end_lineno is inclusive.
        first = node.body[0].lineno - 1
        body = "\n".join(lines[first : node.end_lineno])

    return {
        "name": node.name,
        "code": ast.get_source_segment(file_content, node),
        "docstring": ast.get_docstring(node),
        "start_line": node.lineno,
        "end_line": node.end_lineno,
        "decorators": [
            ast.get_source_segment(file_content, d) for d in node.decorator_list
        ],
        "signature": signature,
        "body": body,
        "is_async": is_async,
    }
@daft.func(return_dtype=CLASSES_SCHEMA, on_error="log")
def extract_classes(
    file: daft.File,
):
    """Return one metadata record per class found anywhere in the file.

    The file is read as UTF-8 and parsed with ``ast``. Every ``ClassDef``
    reachable through ``ast.walk`` is reported, and each record lists the
    functions defined directly in that class body under ``methods``.
    """
    import ast

    with file.open() as handle:
        source = handle.read().decode("utf-8")

    def describe(class_node):
        # Build the record for a single class definition.
        return {
            "name": class_node.name,
            "code": ast.get_source_segment(source, class_node),
            "docstring": ast.get_docstring(class_node),
            "start_line": class_node.lineno,
            "end_line": class_node.end_lineno,
            "decorators": [
                ast.get_source_segment(source, deco)
                for deco in class_node.decorator_list
            ],
            "bases": [ast.unparse(base) for base in class_node.bases],
            # Only direct children count as methods.
            "methods": [
                _extract_function_metadata(member, source)
                for member in class_node.body
                if isinstance(member, (ast.FunctionDef, ast.AsyncFunctionDef))
            ],
        }

    return [
        describe(candidate)
        for candidate in ast.walk(ast.parse(source))
        if isinstance(candidate, ast.ClassDef)
    ]
@daft.func(return_dtype=DataType.list(FUNCTION_SCHEMA), on_error="log")
def extract_functions(file: daft.File):
    """Return one metadata record per function definition in the file.

    The file is read as UTF-8 and parsed with ``ast``. Every sync or async
    function definition reachable through ``ast.walk`` is reported, which
    includes methods and nested functions as well as top-level ones.
    """
    import ast

    with file.open() as handle:
        source = handle.read().decode("utf-8")

    function_nodes = (ast.FunctionDef, ast.AsyncFunctionDef)
    return [
        _extract_function_metadata(candidate, source)
        for candidate in ast.walk(ast.parse(source))
        if isinstance(candidate, function_nodes)
    ]
if __name__ == "__main__":
    # Script entry point: glob Python files, extract classes, methods and
    # functions from each one, caption every callable with an LLM, embed the
    # captions, and print a preview of each resulting dataframe.
    from dotenv import load_dotenv
    from daft import col, lit
    from daft.functions import embed_text, file, prompt, unnest

    load_dotenv()

    # NOTE(review): Caption is not referenced anywhere in this script; it is
    # kept in case it is meant as a structured output model for the prompt.
    class Caption(BaseModel):
        source_file: str
        code: str
        docstring: str
        code_type: str

    def _caption_and_embed(frame, kind):
        """Add ``prompt_input``, ``caption`` and ``embedding`` columns.

        Args:
            frame: Dataframe with ``name``, ``signature``, ``docstring`` and
                ``body`` string columns.
            kind: Noun used in the prompt text, e.g. ``"method"``.

        Returns:
            ``frame`` without placeholder rows, plus the three new columns.
        """
        label = kind.capitalize()
        return (
            # explode() emits a single null entry for files whose list is
            # empty or null; drop those rows so that no empty prompt is sent
            # to the model.
            frame.where(col("name").not_null())
            .with_column(
                "prompt_input",
                lit(f"Explain what this Python {kind} does in one concise sentence. Focus on the purpose and logic.\n\n")
                + lit(f"{label}: ")
                + col("name").fill_null("")
                + lit("\nSignature: ")
                + col("signature").fill_null("")
                + lit("\nDocstring: ")
                + col("docstring").fill_null("")
                + lit("\nCode:\n")
                + col("body").fill_null(""),
            )
            .with_column(
                "caption",
                prompt(col("prompt_input"), model="gpt-4o-mini", provider="openai"),
            )
            .with_column(
                "embedding",
                embed_text(col("caption"), model="text-embedding-3-small", provider="openai"),
            )
        )

    # The glob is relative to the current working directory.
    repo = "../../**/*.py"
    df = daft.from_glob_path(repo).with_column("file", file(col("path")))

    # One row per class, with the class struct unnested into columns.
    classes_df = (
        df.with_column("classes", extract_classes(col("file")))
        .explode("classes")
        .select("path", unnest(col("classes")))
    )

    # One row per method, tagged with the name of its class.
    methods_df = (
        classes_df.select(col("path"), col("name").alias("class_name"), col("methods"))
        .explode("methods")
        .select(
            col("path"),
            col("class_name"),
            col("methods").get("name").alias("name"),
            col("methods").get("signature"),
            col("methods").get("docstring"),
            col("methods").get("body"),
        )
    )

    # Caption and Embed Methods
    methods_df = _caption_and_embed(methods_df, "method")

    # One row per function definition found in each file.
    functions_df = (
        df.with_column("functions", extract_functions(col("file")))
        .explode("functions")
        .select("path", unnest(col("functions")))
    )

    # Caption and Embed Functions
    functions_df = _caption_and_embed(functions_df, "function")

    print("Classes:")
    classes_df.show()
    print("Methods (with Captions and Embeddings):")
    methods_df.show()
    print("Functions (with Captions and Embeddings):")
    functions_df.show()