|
11 | 11 | "import pandas as pd\n", |
12 | 12 | "from collections import defaultdict\n", |
13 | 13 | "\n", |
14 | | - "pd.set_option('display.max_colwidth', None)\n", |
| 14 | + "pd.set_option(\"display.max_colwidth\", None)\n", |
15 | 15 | "pd.set_option(\"display.max_rows\", 1000)\n", |
16 | | - "pd.set_option('display.width', None)" |
| 16 | + "pd.set_option(\"display.width\", None)" |
17 | 17 | ] |
18 | 18 | }, |
19 | 19 | { |
|
60 | 60 | " warn = None\n", |
61 | 61 | " if \"ERROR\" in line:\n", |
62 | 62 | " error = line.split(\"\\t\")[-1]\n", |
63 | | - " \n", |
| 63 | + "\n", |
64 | 64 | " if \"WARN\" in line:\n", |
65 | 65 | " warn = line.split(\"\\t\")[-1]\n", |
66 | | - " \n", |
67 | | - " if error or warn: \n", |
| 66 | + "\n", |
| 67 | + " if error or warn:\n", |
68 | 68 | " data[\"file\"].append(file.name)\n", |
69 | 69 | " data[\"error\"].append(error)\n", |
70 | 70 | " data[\"warning\"].append(warn)" |
|
505 | 505 | ], |
506 | 506 | "source": [ |
507 | 507 | "# how many files could not be processed in %?\n", |
508 | | - "len(df[~df.error.isna()]) / len(files) * 100\n", |
509 | | - " " |
| 508 | + "len(df[~df.error.isna()]) / len(files) * 100" |
510 | 509 | ] |
511 | 510 | }, |
512 | 511 | { |
|
597 | 596 | ], |
598 | 597 | "source": [ |
599 | 598 | "# month list is empty?\n", |
600 | | - "print(\"\\n\".join(l.split(\".\")[0] for l in df[(~df.error.isna()) & (df.error.str.contains(\"month_list\"))].file.tolist()))" |
| 599 | + "print(\n", |
| 600 | + " \"\\n\".join(\n", |
| 601 | + " l.split(\".\")[0]\n", |
| 602 | + " for l in df[(~df.error.isna()) & (df.error.str.contains(\"month_list\"))].file.tolist()\n", |
| 603 | + " )\n", |
| 604 | + ")" |
601 | 605 | ] |
602 | 606 | }, |
603 | 607 | { |
|
627 | 631 | ], |
628 | 632 | "source": [ |
629 | 633 | "# cannot find patient data\n", |
630 | | - "for year in range(2017,2023):\n", |
| 634 | + "for year in range(2017, 2023):\n", |
631 | 635 | " subdf = df[(~df.error.isna()) & (df.error.str.contains(\"readxl::cell_limits\"))]\n", |
632 | 636 | " print(subdf[subdf.file.str.startswith(str(year))].file.tolist())" |
633 | 637 | ] |
|
854 | 858 | } |
855 | 859 | ], |
856 | 860 | "source": [ |
857 | | - "df[~df.warning.isna()].drop_duplicates()\n" |
| 861 | + "df[~df.warning.isna()].drop_duplicates()" |
858 | 862 | ] |
859 | 863 | }, |
860 | 864 | { |
|
925 | 929 | } |
926 | 930 | ], |
927 | 931 | "source": [ |
928 | | - "missed_names = df[(~df.warning.isna()) & (df.warning.str.contains(\"Extra\"))].warning.str.strip(\"Extra columns in patient data:\").drop_duplicates().to_list()\n", |
| 932 | + "missed_names = (\n", |
| 933 | + " df[(~df.warning.isna()) & (df.warning.str.contains(\"Extra\"))]\n", |
| 934 | + " .warning.str.strip(\"Extra columns in patient data:\")\n", |
| 935 | + " .drop_duplicates()\n", |
| 936 | + " .to_list()\n", |
| 937 | + ")\n", |
929 | 938 | "\n", |
930 | 939 | "names = set()\n", |
931 | 940 | "for x in missed_names:\n", |
932 | 941 | " names.update(x for x in x.split(\",\") if x)\n", |
933 | | - " \n", |
| 942 | + "\n", |
934 | 943 | "names" |
935 | 944 | ] |
936 | 945 | }, |
|
1877 | 1886 | "source": [ |
1878 | 1887 | "subdf = df[(~df.warning.isna()) & (df.warning.str.contains(\"Found invalid value\", regex=False))]\n", |
1879 | 1888 | "\n", |
1880 | | - "subdf.warning.str.strip(\"Found invalid value \").str.split(expand=True)[[0,3]].drop_duplicates()" |
| 1889 | + "subdf.warning.str.strip(\"Found invalid value \").str.split(expand=True)[[0, 3]].drop_duplicates()" |
1881 | 1890 | ] |
1882 | 1891 | }, |
1883 | 1892 | { |
|
3192 | 3201 | } |
3193 | 3202 | ], |
3194 | 3203 | "source": [ |
3195 | | - "subdf = df[(~df.warning.isna()) & (df.warning.str.contains(\"not in the list of allowed values\", regex=False))]\n", |
| 3204 | + "subdf = df[\n", |
| 3205 | + " (~df.warning.isna())\n", |
| 3206 | + " & (df.warning.str.contains(\"not in the list of allowed values\", regex=False))\n", |
| 3207 | + "]\n", |
3196 | 3208 | "\n", |
3197 | 3209 | "subdf.warning.str.split(expand=True)[[3, 6]].drop_duplicates()" |
3198 | 3210 | ] |
|
0 commit comments