Skip to content

Commit aa2e5bd

Browse files
committed
updates the data preparation data
1 parent ec3ca5c commit aa2e5bd

3 files changed

Lines changed: 133 additions & 7 deletions

File tree

.gitignore

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,3 +17,7 @@ venv/
1717
Thumbs.db
1818
__pycache__/
1919
*.pyc
20+
21+
# Except these files
22+
!1_datasets/processed/tedsd_puf_2023_cleaned.csv
23+
!1_datasets/raw/tedsd_puf_2023.csv

2_data_preparation/cleaning_teds_d.ipynb

Lines changed: 12 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
},
2121
{
2222
"cell_type": "code",
23-
"execution_count": null,
23+
"execution_count": 31,
2424
"id": "596bf3e0",
2525
"metadata": {},
2626
"outputs": [],
@@ -29,6 +29,8 @@
2929
"import numpy as np\n",
3030
"from tqdm import tqdm\n",
3131
"import warnings\n",
32+
"import os\n",
33+
"os.getcwd()\n",
3234
"\n",
3335
"warnings.filterwarnings(\"ignore\")"
3436
]
@@ -876,15 +878,18 @@
876878
{
877879
"cell_type": "code",
878880
"execution_count": null,
879-
"id": "9ba487dc",
881+
"id": "dfbecb3a",
880882
"metadata": {},
881883
"outputs": [],
882884
"source": [
885+
"from pathlib import Path\n",
886+
"import os\n",
887+
"\n",
883888
"if __name__ == \"__main__\":\n",
884889
" INPUT_FILE = \"1_datasets/raw/tedsd_puf_2023.csv\"\n",
885890
" OUTPUT_FILE = \"1_datasets/processed/teds_d_2023_cleaned.csv\"\n",
886891
"\n",
887-
" clean_teds_d_data(INPUT_FILE, OUTPUT_FILE)"
892+
" clean_teds_d_data(INPUT_FILE, OUTPUT_FILE)\n"
888893
]
889894
},
890895
{
@@ -899,21 +904,21 @@
899904
{
900905
"cell_type": "code",
901906
"execution_count": null,
902-
"id": "b16a295c",
907+
"id": "76f5cdd5",
903908
"metadata": {},
904909
"outputs": [],
905910
"source": [
906911
"df = pd.read_csv(\"1_datasets/processed/teds_d_2023_cleaned.csv\")\n",
907912
"\n",
908913
"df_sample = df.sample(1000, random_state=42)\n",
909914
"\n",
910-
"df_sample.to_csv(\"1_datasets/sample/tedsd_sample.csv\", index=False)"
915+
"df_sample.to_csv(\"1_datasets/sample/tedsd_sample.csv\", index=False)\n"
911916
]
912917
}
913918
],
914919
"metadata": {
915920
"kernelspec": {
916-
"display_name": "Python 3",
921+
"display_name": ".venv",
917922
"language": "python",
918923
"name": "python3"
919924
},
@@ -927,7 +932,7 @@
927932
"name": "python",
928933
"nbconvert_exporter": "python",
929934
"pygments_lexer": "ipython3",
930-
"version": "3.12.10"
935+
"version": "3.14.0"
931936
}
932937
},
933938
"nbformat": 4,

requirements.txt

Lines changed: 117 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,117 @@
1+
anyio==4.11.0
2+
argon2-cffi==25.1.0
3+
argon2-cffi-bindings==25.1.0
4+
arrow==1.4.0
5+
astroid==4.0.1
6+
asttokens==3.0.0
7+
async-lru==2.0.5
8+
attrs==25.4.0
9+
babel==2.17.0
10+
beautifulsoup4==4.14.2
11+
bleach==6.3.0
12+
certifi==2025.10.5
13+
cffi==2.0.0
14+
charset-normalizer==3.4.4
15+
colorama==0.4.6
16+
comm==0.2.3
17+
contourpy==1.3.3
18+
cycler==0.12.1
19+
debugpy==1.8.17
20+
decorator==5.2.1
21+
defusedxml==0.7.1
22+
dill==0.4.0
23+
executing==2.2.1
24+
fastjsonschema==2.21.2
25+
fonttools==4.60.1
26+
fqdn==1.5.1
27+
h11==0.16.0
28+
httpcore==1.0.9
29+
httpx==0.28.1
30+
idna==3.11
31+
ipykernel==7.1.0
32+
ipython==9.7.0
33+
ipython_pygments_lexers==1.1.1
34+
ipywidgets==8.1.8
35+
isoduration==20.11.0
36+
isort==7.0.0
37+
jedi==0.19.2
38+
Jinja2==3.1.6
39+
json5==0.12.1
40+
jsonpointer==3.0.0
41+
jsonschema==4.25.1
42+
jsonschema-specifications==2025.9.1
43+
jupyter==1.1.1
44+
jupyter-console==6.6.3
45+
jupyter-events==0.12.0
46+
jupyter-lsp==2.3.0
47+
jupyter_client==8.6.3
48+
jupyter_core==5.9.1
49+
jupyter_server==2.17.0
50+
jupyter_server_terminals==0.5.3
51+
jupyterlab==4.4.10
52+
jupyterlab_pygments==0.3.0
53+
jupyterlab_server==2.28.0
54+
jupyterlab_widgets==3.0.16
55+
kiwisolver==1.4.9
56+
lark==1.3.1
57+
MarkupSafe==3.0.3
58+
matplotlib==3.10.7
59+
matplotlib-inline==0.2.1
60+
mccabe==0.7.0
61+
mistune==3.1.4
62+
nbclient==0.10.2
63+
nbconvert==7.16.6
64+
nbformat==5.10.4
65+
nest-asyncio==1.6.0
66+
nodeenv==1.9.1
67+
notebook==7.4.7
68+
notebook_shim==0.2.4
69+
numpy==2.3.4
70+
packaging==25.0
71+
pandas==2.3.3
72+
pandocfilters==1.5.1
73+
parso==0.8.5
74+
pillow==12.0.0
75+
platformdirs==4.5.0
76+
prometheus_client==0.23.1
77+
prompt_toolkit==3.0.52
78+
psutil==7.1.3
79+
pure_eval==0.2.3
80+
pycparser==2.23
81+
Pygments==2.19.2
82+
pylint==4.0.2
83+
pyparsing==3.2.5
84+
pyright==1.1.407
85+
python-dateutil==2.9.0.post0
86+
python-json-logger==4.0.0
87+
pytz==2025.2
88+
pywinpty==3.0.2
89+
PyYAML==6.0.3
90+
pyzmq==27.1.0
91+
referencing==0.37.0
92+
requests==2.32.5
93+
rfc3339-validator==0.1.4
94+
rfc3986-validator==0.1.1
95+
rfc3987-syntax==1.1.0
96+
rpds-py==0.28.0
97+
seaborn==0.13.2
98+
Send2Trash==1.8.3
99+
setuptools==80.9.0
100+
six==1.17.0
101+
sniffio==1.3.1
102+
soupsieve==2.8
103+
stack-data==0.6.3
104+
terminado==0.18.1
105+
tinycss2==1.4.0
106+
tomlkit==0.13.3
107+
tornado==6.5.2
108+
traitlets==5.14.3
109+
typing_extensions==4.15.0
110+
tzdata==2025.2
111+
uri-template==1.3.0
112+
urllib3==2.5.0
113+
wcwidth==0.2.14
114+
webcolors==25.10.0
115+
webencodings==0.5.1
116+
websocket-client==1.9.0
117+
widgetsnbextension==4.0.15

0 commit comments

Comments
 (0)