Merge pull request #123 from scaleapi/jihan/ann-api

ardila · web-flow · commit 019817a4b555 · 2021-09-21T13:33:38.000-07:00
Nucleus - export ground truth (gt) and predictions (preds)
diff --git a/nucleus/dataset.py b/nucleus/dataset.py
@@ -385,6 +385,21 @@ def loc(self, dataset_item_id: str) -> dict:
         response = self._client.dataitem_loc(self.id, dataset_item_id)
         return format_dataset_item_response(response)
 
+    def ground_truth_loc(self, reference_id: str, annotation_id: str):
+        """
+        Returns a single ground truth Annotation, looked up by its item reference id and annotation id.
+        :param reference_id: User-specified id of the dataset item the ground truth is attached to
+        :param annotation_id: User-specified or auto-generated id of the annotation
+        :return: the object built by Annotation.from_json from the response, documented as
+        BoxAnnotation | PolygonAnnotation | CuboidAnnotation
+        """
+        response = self._client.make_request(
+            {},
+            f"dataset/{self.id}/groundTruth/loc/{reference_id}/{annotation_id}",
+            requests.get,
+        )
+        return Annotation.from_json(response)
+
     def create_slice(
         self,
         name: str,
diff --git a/nucleus/job.py b/nucleus/job.py
@@ -40,13 +40,13 @@ def errors(self) -> List[str]:
     def sleep_until_complete(self, verbose_std_out=True):
         while 1:
             status = self.status()
-
             time.sleep(JOB_POLLING_INTERVAL)
 
             if verbose_std_out:
                 print(f"Status at {time.ctime()}: {status}")
             if status["status"] == "Running":
                 continue
+
             break
 
         final_status = status
diff --git a/nucleus/model_run.py b/nucleus/model_run.py
@@ -19,6 +19,7 @@
     CuboidPrediction,
     PolygonPrediction,
     SegmentationPrediction,
+    from_json,
 )
 
 
@@ -160,6 +161,23 @@ def loc(self, dataset_item_id: str):
         )
         return self._format_prediction_response(response)
 
+    def prediction_loc(self, reference_id: str, annotation_id: str):
+        """
+        Returns a single Prediction, looked up by its reference id and annotation id.
+        :param reference_id: the user-specified id for the image
+        :param annotation_id: the user-specified id for the prediction or, if one was not provided, the Scale internally generated id for the prediction
+        :return: the object built by from_json (which falls back to SegmentationPrediction for other types), documented as
+         BoxPrediction | PolygonPrediction | CuboidPrediction
+        """
+
+        response = self._client.make_request(
+            {},
+            f"modelRun/{self.model_run_id}/prediction/loc/{reference_id}/{annotation_id}",
+            requests.get,
+        )
+
+        return from_json(response)
+
     def ungrouped_export(self):
         json_response = self._client.make_request(
             payload={},
diff --git a/nucleus/prediction.py b/nucleus/prediction.py
@@ -10,10 +10,14 @@
 )
 from .constants import (
     ANNOTATION_ID_KEY,
+    BOX_TYPE,
+    CUBOID_TYPE,
+    POLYGON_TYPE,
     REFERENCE_ID_KEY,
     METADATA_KEY,
     GEOMETRY_KEY,
     LABEL_KEY,
+    TYPE_KEY,
     X_KEY,
     Y_KEY,
     WIDTH_KEY,
@@ -29,6 +33,17 @@
 )
 
 
+def from_json(payload: dict):  # build the Prediction subclass selected by payload[TYPE_KEY]
+    if payload.get(TYPE_KEY, None) == BOX_TYPE:
+        return BoxPrediction.from_json(payload)
+    elif payload.get(TYPE_KEY, None) == POLYGON_TYPE:
+        return PolygonPrediction.from_json(payload)
+    elif payload.get(TYPE_KEY, None) == CUBOID_TYPE:
+        return CuboidPrediction.from_json(payload)
+    else:  # NOTE(review): a missing or unrecognized type also lands here - confirm this fallback is intended
+        return SegmentationPrediction.from_json(payload)
+
+
 class SegmentationPrediction(SegmentationAnnotation):
     # No need to define init or to_payload methods because
     # we default to functions defined in the parent class
diff --git a/pyproject.toml b/pyproject.toml
@@ -21,7 +21,7 @@ exclude = '''
 
 [tool.poetry]
 name = "scale-nucleus"
-version = "0.1.21"
+version = "0.1.22"
 description = "The official Python client library for Nucleus, the Data Platform for AI"
 license =  "MIT"
 authors = ["Scale AI Nucleus Team <nucleusapi@scaleapi.com>"]
diff --git a/scripts/load_test.py b/scripts/load_test.py
@@ -4,6 +4,8 @@
 import nucleus
 import os
 
+from itertools import zip_longest
+
 import time
 
 
@@ -21,6 +23,8 @@
     "API Key to use. Defaults to NUCLEUS_PYTEST_API_KEY environment variable",
 )
 
+flags.DEFINE_integer("job_parallelism", 8, "Amount of concurrent jobs to use.")
+
 # Dataset upload flags
 flags.DEFINE_enum(
     "create_or_reuse_dataset",
@@ -35,12 +39,12 @@
 )
 flags.DEFINE_integer(
     "num_dataset_items",
-    100000,
+    10000000,
     "Number of dataset items to create if creating a dataset",
     lower_bound=0,
 )
 flags.DEFINE_bool(
-    "cleanup_dataset", True, "Whether to delete the dataset after the test."
+    "cleanup_dataset", False, "Whether to delete the dataset after the test."
 )
 
 # Annotation upload flags
@@ -54,11 +58,21 @@
 # Prediction upload flags
 flags.DEFINE_integer(
     "num_predictions_per_dataset_item",
-    0,
+    1,
     "Number of annotations per dataset item",
     lower_bound=0,
 )
 
+TIMINGS = {}
+
+
+def chunk(iterable, chunk_size, fillvalue=None):
+    "Yield lazy groups of up to chunk_size consecutive items; with the default fillvalue the last group's padding is filtered out"
+    args = [iter(iterable)] * chunk_size  # one shared iterator repeated, so zip_longest draws consecutive items; NOTE(review): chunk_size 0 yields no groups
+
+    for chunk_iterable in zip_longest(*args, fillvalue=fillvalue):
+        yield filter(lambda x: x is not None, chunk_iterable)  # NOTE(review): drops every None, real items included; a non-None fillvalue is not removed
+
 
 def client():
     return nucleus.NucleusClient(api_key=FLAGS.api_key)
@@ -126,15 +140,23 @@ def create_or_get_dataset():
         dataset = client().create_dataset("Privacy Mode Load Test Dataset")
         print("Starting dataset item upload")
         tic = time.time()
-        job = dataset.append(
-            dataset_item_generator(), update=True, asynchronous=True
-        )
-        try:
-            job.sleep_until_complete(False)
-        except JobError:
-            print(job.errors())
+        chunk_size = FLAGS.num_dataset_items // FLAGS.job_parallelism
+        jobs = []
+        for dataset_item_chunk in chunk(dataset_item_generator(), chunk_size):
+            jobs.append(
+                dataset.append(
+                    dataset_item_chunk, update=True, asynchronous=True
+                )
+            )
+
+        for job in jobs:
+            try:
+                job.sleep_until_complete(False)
+            except JobError:
+                print(job.errors())
         toc = time.time()
         print("Finished dataset item upload: %s" % (toc - tic))
+        TIMINGS[f"Dataset Item Upload {FLAGS.num_dataset_items}"] = toc - tic
     else:
         print(f"Reusing dataset {FLAGS.dataset_id}")
         dataset = client().get_dataset(FLAGS.dataset_id)
@@ -144,15 +166,26 @@ def create_or_get_dataset():
 def upload_annotations(dataset: Dataset):
     print("Starting annotation upload")
     tic = time.time()
-    job = dataset.annotate(
-        list(annotation_generator()), update=False, asynchronous=True
+    jobs = []
+    num_annotations = (
+        FLAGS.num_dataset_items * FLAGS.num_annotations_per_dataset_item
     )
-    try:
-        job.sleep_until_complete(False)
-    except JobError:
-        print(job.errors())
+    chunk_size = num_annotations // FLAGS.job_parallelism
+    for annotation_chunk in chunk(annotation_generator(), chunk_size):
+        jobs.append(
+            dataset.annotate(
+                list(annotation_chunk), update=False, asynchronous=True
+            )
+        )
+
+    for job in jobs:
+        try:
+            job.sleep_until_complete(False)
+        except JobError:
+            print(job.errors())
     toc = time.time()
     print("Finished annotation upload: %s" % (toc - tic))
+    TIMINGS[f"Annotation Upload {num_annotations}"] = toc - tic
 
 
 def upload_predictions(dataset: Dataset):
@@ -167,16 +200,24 @@ def upload_predictions(dataset: Dataset):
 
     print("Starting prediction upload")
 
-    job = run.predict(
-        list(prediction_generator()), update=True, asynchronous=True
+    num_predictions = (
+        FLAGS.num_dataset_items * FLAGS.num_predictions_per_dataset_item
     )
+    chunk_size = num_predictions // FLAGS.job_parallelism
+    jobs = []
+    for prediction_chunk in chunk(prediction_generator(), chunk_size):
+        jobs.append(
+            run.predict(list(prediction_chunk), update=True, asynchronous=True)
+        )
 
-    try:
-        job.sleep_until_complete(False)
-    except JobError:
-        print(job.errors())
+    for job in jobs:
+        try:
+            job.sleep_until_complete(False)
+        except JobError:
+            print(job.errors())
     toc = time.time()
     print("Finished prediction upload: %s" % (toc - tic))
+    TIMINGS[f"Prediction Upload {num_predictions}"] = toc - tic
 
 
 def main(unused_argv):
@@ -194,6 +235,8 @@ def main(unused_argv):
     if FLAGS.cleanup_dataset and FLAGS.create_or_reuse_dataset == "create":
         client().delete_dataset(dataset.id)
 
+    print(TIMINGS)
+
 
 if __name__ == "__main__":
     app.run(main)
diff --git a/tests/test_annotation.py b/tests/test_annotation.py
@@ -70,6 +70,11 @@ def test_box_gt_upload(dataset):
     assert response["annotations_ignored"] == 0
 
     response = dataset.refloc(annotation.reference_id)["annotations"]["box"]
+    single_annotation_response = dataset.ground_truth_loc(
+        annotation.reference_id, annotation.annotation_id
+    )
+
+    assert response[0] == single_annotation_response
     assert len(response) == 1
     response_annotation = response[0]
     assert_box_annotation_matches_dict(
diff --git a/tests/test_dataset.py b/tests/test_dataset.py
@@ -332,7 +332,7 @@ def test_raises_error_for_duplicate():
 def test_dataset_export_autotag_tagged_items(CLIENT):
     # This test can only run for the test user who has an indexed dataset.
     # TODO: if/when we can create autotags via api, create one instead.
-    if NUCLEUS_PYTEST_USER_ID in os.environ["NUCLEUS_PYTEST_USER_ID"]:
+    if os.environ.get("NUCLEUS_PYTEST_USER_ID") == NUCLEUS_PYTEST_USER_ID:
         dataset = CLIENT.get_dataset(DATASET_WITH_AUTOTAG)
 
         with pytest.raises(NucleusAPIError) as api_error:
@@ -362,7 +362,7 @@ def test_dataset_export_autotag_tagged_items(CLIENT):
 def test_dataset_export_autotag_training_items(CLIENT):
     # This test can only run for the test user who has an indexed dataset.
     # TODO: if/when we can create autotags via api, create one instead.
-    if NUCLEUS_PYTEST_USER_ID in os.environ["NUCLEUS_PYTEST_USER_ID"]:
+    if os.environ.get("NUCLEUS_PYTEST_USER_ID") == NUCLEUS_PYTEST_USER_ID:
         dataset = CLIENT.get_dataset(DATASET_WITH_AUTOTAG)
 
         with pytest.raises(NucleusAPIError) as api_error:
diff --git a/tests/test_prediction.py b/tests/test_prediction.py
@@ -1,4 +1,5 @@
 from nucleus.job import AsyncJob
+import os
 import pytest
 import time
 from .helpers import (
@@ -20,14 +21,12 @@
     PolygonPrediction,
     SegmentationPrediction,
     DatasetItem,
-    Segment,
     ModelRun,
+    Segment,
     Point,
 )
 from nucleus.constants import ERROR_PAYLOAD
 
-from nucleus import utils
-
 
 def test_reprs():
     def test_repr(test_object: any):
@@ -85,6 +84,10 @@ def test_box_pred_upload(model_run):
     assert response["predictions_ignored"] == 0
 
     response = model_run.refloc(prediction.reference_id)["box"]
+    single_prediction = model_run.prediction_loc(
+        prediction.reference_id, prediction.annotation_id
+    )
+    assert response[0] == single_prediction
     assert len(response) == 1
     assert_box_prediction_matches_dict(response[0], TEST_BOX_PREDICTIONS[0])
 
diff --git a/tests/test_scene.py b/tests/test_scene.py
@@ -320,6 +320,7 @@ def test_scene_upload_async(dataset):
                 "new_scenes": len(scenes),
                 "ignored_scenes": 0,
                 "scenes_errored": 0,
+                "updated_scenes": 0,
             }
         },
         "job_progress": "1.00",

Original file line number	Diff line number	Diff line change
`@@ -320,6 +320,7 @@ def test_scene_upload_async(dataset):`
`320`	`320`	`"new_scenes": len(scenes),`
`321`	`321`	`"ignored_scenes": 0,`
`322`	`322`	`"scenes_errored": 0,`
	`323`	`+ "updated_scenes": 0,`
`323`	`324`	`}`
`324`	`325`	`},`
`325`	`326`	`"job_progress": "1.00",`