
Commit e79f7f4

Added unlabeled data to WILDS datasets

1 parent 3da4b35

90 files changed: 9,302 additions & 787 deletions


.gitignore

Lines changed: 4 additions & 0 deletions
```diff
@@ -1,5 +1,9 @@
 __pycache__
+.idea
 build
+data
+logs
 dist
 venv
 wilds.egg-info
+.DS_Store
```

README.md

Lines changed: 205 additions & 116 deletions
Large diffs are not rendered by default.
dataset_preprocessing/amazon_yelp/create_unlabeled_amazon.py (new file)

Lines changed: 168 additions & 0 deletions

```python
import argparse
import csv
import os

import numpy as np
import pandas as pd

# Fix the seed for reproducibility
np.random.seed(0)

"""
Create unlabeled splits for Amazon.

Usage:
    python dataset_preprocessing/amazon_yelp/create_unlabeled_amazon.py <path>
"""

NOT_IN_DATASET = -1

# Splits
# 'train': 0, 'val': 1, 'id_val': 2, 'test': 3, 'id_test': 4,
# 'val_unlabeled': 11, 'test_unlabeled': 12, 'extra_unlabeled': 13
(
    TRAIN,
    OOD_VAL,
    ID_VAL,
    OOD_TEST,
    ID_TEST,
) = range(5)
VAL_UNLABELED, TEST_UNLABELED, EXTRA_UNLABELED = range(11, 14)


def main(dataset_path):
    def output_split_sizes():
        print("-" * 50)
        print(f'Train size: {len(split_df[split_df["split"] == TRAIN])}')
        print(f'Val size: {len(split_df[split_df["split"] == OOD_VAL])}')
        print(f'ID Val size: {len(split_df[split_df["split"] == ID_VAL])}')
        print(f'Test size: {len(split_df[split_df["split"] == OOD_TEST])}')
        print(f'ID Test size: {len(split_df[split_df["split"] == ID_TEST])}')
        print(
            f'OOD Val Unlabeled size: {len(split_df[split_df["split"] == VAL_UNLABELED])}'
        )
        print(
            f'OOD Test Unlabeled size: {len(split_df[split_df["split"] == TEST_UNLABELED])}'
        )
        print(
            f'Extra Unlabeled size: {len(split_df[split_df["split"] == EXTRA_UNLABELED])}'
        )
        print(
            f'Number of examples not included: {len(split_df[split_df["split"] == NOT_IN_DATASET])}'
        )
        print(f'Number of unclean reviews: {len(split_df[~split_df["clean"]])}')
        print("-" * 50)
        print("\n")

    def set_unlabeled_split(split, reviewers):
        # Assign unused, clean reviews written by users from `reviewers` to `split`
        split_df.loc[
            (split_df["split"] == NOT_IN_DATASET)
            & split_df["clean"]
            & data_df["reviewerID"].isin(reviewers),
            "split",
        ] = split

    def validate_split(split, expected_reviewers_count):
        # Sanity check:
        # Ensure the number of reviewers matches the expected count
        # (e.g., the reviewer count of the labeled counterpart split)
        # and each reviewer has at least 75 reviews.
        actual_reviewers_count = (
            data_df[(split_df["split"] == split)]["reviewerID"].unique().size
        )
        assert (
            actual_reviewers_count == expected_reviewers_count
        ), "The number of reviewers ({}) did not equal {}".format(
            actual_reviewers_count, expected_reviewers_count
        )
        min_reviews_count = (
            data_df[(split_df["split"] == split)]["reviewerID"].value_counts().min()
        )
        assert (
            min_reviews_count >= 75
        ), "Each reviewer should have at least 75 reviews, but got a minimum of {} reviews.".format(
            min_reviews_count
        )

    data_df = pd.read_csv(
        os.path.join(dataset_path, "reviews.csv"),
        dtype={
            "reviewerID": str,
            "asin": str,
            "reviewTime": str,
            "unixReviewTime": int,
            "reviewText": str,
            "summary": str,
            "verified": bool,
            "category": str,
            "reviewYear": int,
        },
        keep_default_na=False,
        na_values=[],
        quoting=csv.QUOTE_NONNUMERIC,
    )
    user_csv_path = os.path.join(dataset_path, "splits", "user.csv")
    split_df = pd.read_csv(user_csv_path)
    assert split_df.shape[0] == data_df.shape[0]
    output_split_sizes()

    ood_val_reviewers_ids = data_df[
        split_df["split"] == OOD_VAL
    ].reviewerID.unique()  # 1334 users
    set_unlabeled_split(VAL_UNLABELED, ood_val_reviewers_ids)

    ood_test_reviewers_ids = data_df[
        split_df["split"] == OOD_TEST
    ].reviewerID.unique()  # 1334 users
    set_unlabeled_split(TEST_UNLABELED, ood_test_reviewers_ids)

    # For EXTRA_UNLABELED, use any users not in any of the other splits
    existing_reviewer_ids = np.concatenate(
        [
            ood_test_reviewers_ids,
            ood_val_reviewers_ids,
            data_df[split_df["split"] == TRAIN].reviewerID.unique(),
            data_df[split_df["split"] == ID_VAL].reviewerID.unique(),
            data_df[split_df["split"] == ID_TEST].reviewerID.unique(),
        ]
    )
    # There are 151,736 extra reviewers
    extra_reviewers_ids = data_df[
        ~data_df.reviewerID.isin(existing_reviewer_ids)
    ].reviewerID.unique()
    set_unlabeled_split(EXTRA_UNLABELED, extra_reviewers_ids)

    # Exclude reviewers with fewer than 75 reviews.
    review_counts = data_df[(split_df["split"] == EXTRA_UNLABELED)][
        "reviewerID"
    ].value_counts()
    reviewers_to_filter_out = review_counts[review_counts < 75].keys()
    split_df.loc[
        (split_df["split"] == EXTRA_UNLABELED)
        & data_df["reviewerID"].isin(reviewers_to_filter_out),
        "split",
    ] = NOT_IN_DATASET

    # We are done splitting; output stats.
    output_split_sizes()

    # Sanity checks
    validate_split(VAL_UNLABELED, ood_val_reviewers_ids.size)
    validate_split(TEST_UNLABELED, ood_test_reviewers_ids.size)
    # After filtering out unclean reviews and ensuring >= 75 reviews per reviewer,
    # we are left with 21,694 reviewers.
    validate_split(EXTRA_UNLABELED, 21694)

    # Write out the new unlabeled splits to user.csv
    split_df.to_csv(user_csv_path, index=False)
    print("Done.")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Create unlabeled splits for Amazon.")
    parser.add_argument(
        "path",
        type=str,
        help="Path to the Amazon dataset",
    )
    args = parser.parse_args()
    main(args.path)
```
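For context on how the split codes above are consumed, here is a minimal sketch of inspecting the regenerated `user.csv`. The column name and split codes come from the script itself; the dataset path is a placeholder, and the snippet is illustrative rather than part of the commit:

```python
import os

import pandas as pd

# Placeholder path (not from the commit); point this at the same Amazon
# dataset directory that was passed to create_unlabeled_amazon.py.
dataset_path = "data/amazon"

# The script rewrites splits/user.csv in place, adding split codes 11-13.
split_df = pd.read_csv(os.path.join(dataset_path, "splits", "user.csv"))
for name, code in [
    ("val_unlabeled", 11),
    ("test_unlabeled", 12),
    ("extra_unlabeled", 13),
]:
    print(f"{name}: {(split_df['split'] == code).sum()} reviews")
```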
Lines changed: 24 additions & 0 deletions
## Unlabeled Camelyon17-WILDS patch processing

#### Requirements

- openslide-python>=1.1.2
- opencv-python>=4.4.0

openslide-python relies on first installing OpenSlide; see the [installation instructions](https://github.com/openslide/openslide-python).

#### Instructions

1. Download the [CAMELYON17 training data](https://drive.google.com/drive/folders/0BzsdkU4jWx9BSEI2X1VOLUpYZ3c?resourcekey=0-41XIPJNyEAo598wHxVAP9w) into `SLIDE_ROOT`.

2. Run `python generate_all_patch_coords.py --slide_root SLIDE_ROOT --output_root OUTPUT_ROOT` to generate a .csv of all potential patches as well as the tissue masks for each WSI. `OUTPUT_ROOT` is wherever you would like the patches to eventually be written.

3. Then run `python generate_final_metadata.py --slide_root SLIDE_ROOT --output_root OUTPUT_ROOT` to generate the metadata.csv file for unlabeled Camelyon.

4. Finally, run `python extract_final_patches_to_disk.py --slide_root SLIDE_ROOT --output_root OUTPUT_ROOT` to extract the chosen patches from the WSIs and write them to disk.
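Because openslide-python only binds a native library, a quick import check before step 1 can save debugging later. This is an illustrative sketch, not part of the commit; it assumes the two packages from the Requirements list above are already installed:

```python
# Confirm that the native OpenSlide library and both Python dependencies load.
import cv2
import openslide

print("OpenSlide C library:", openslide.__library_version__)
print("openslide-python binding:", openslide.__version__)
print("OpenCV:", cv2.__version__)
```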
extract_final_patches_to_disk.py (new file)

Lines changed: 79 additions & 0 deletions
```python
import argparse
import os

import openslide
import pandas as pd
from tqdm import tqdm

from generate_all_patch_coords import PATCH_LEVEL, CENTER_SIZE


def write_patch_images_from_df(slide_root, output_root):
    print("Writing patch images to disk...")
    read_df = pd.read_csv(
        os.path.join(output_root, "metadata.csv"), index_col=0, dtype={"patient": "str"}
    )

    patch_level = PATCH_LEVEL
    center_size = CENTER_SIZE
    patch_size = center_size * 3

    for idx in tqdm(read_df.index):
        orig_x = read_df.loc[idx, "x_coord"]
        orig_y = read_df.loc[idx, "y_coord"]
        center = read_df.loc[idx, "center"]
        patient = read_df.loc[idx, "patient"]
        node = read_df.loc[idx, "node"]

        patch_folder = os.path.join(
            output_root, "patches", f"patient_{patient}_node_{node}"
        )
        patch_path = os.path.join(
            patch_folder,
            f"patch_patient_{patient}_node_{node}_x_{orig_x}_y_{orig_y}.png",
        )

        os.makedirs(patch_folder, exist_ok=True)
        # Skip patches that have already been written to disk.
        if os.path.isfile(patch_path):
            continue

        slide_path = os.path.join(
            slide_root,
            f"center_{center}",
            f"patient_{patient}",
            f"patient_{patient}_node_{node}.tif",
        )
        slide = openslide.OpenSlide(slide_path)

        # Coords are at patch_level.
        # First shift coords to the top-left corner of the entire patch.
        x = orig_x - center_size
        y = orig_y - center_size
        # Then match to level 0 coords so we can use read_region,
        # which expects its location argument in level-0 coordinates.
        x = int(
            round(
                x
                * slide.level_dimensions[0][0]
                / slide.level_dimensions[patch_level][0]
            )
        )
        y = int(
            round(
                y
                * slide.level_dimensions[0][1]
                / slide.level_dimensions[patch_level][1]
            )
        )

        # Read at patch_level; the size is in pixels at that level.
        patch = slide.read_region((x, y), patch_level, (patch_size, patch_size))
        patch.save(patch_path)
    print("Done.")


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--slide_root", required=True)
    parser.add_argument("--output_root", required=True)
    args = parser.parse_args()
    write_patch_images_from_df(slide_root=args.slide_root, output_root=args.output_root)
```
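The coordinate arithmetic above scales patch-level coords to level 0 by the ratio of level dimensions; OpenSlide exposes the same factor directly as `level_downsamples`. A minimal sketch of the equivalence, using a hypothetical slide path and coordinates rather than anything from the commit:

```python
import openslide

# Hypothetical slide path, following the naming pattern the script expects.
slide = openslide.OpenSlide("center_0/patient_017/patient_017_node_2.tif")
x_patch, y_patch = 1000, 2000  # example coords at PATCH_LEVEL (level 2)

# level_downsamples[2] equals level_dimensions[0][i] / level_dimensions[2][i]
# up to floating-point rounding, so either form maps level-2 coords to level 0.
downsample = slide.level_downsamples[2]
x0, y0 = int(round(x_patch * downsample)), int(round(y_patch * downsample))

# read_region takes its location in level-0 coordinates, while the size is in
# pixels at the requested level (96 assumes CENTER_SIZE == 32 in the script).
patch = slide.read_region((x0, y0), 2, (96, 96))
```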
