@@ -839,9 +839,16 @@ def characterize_data(argv=None):
839839 a date-time prefix and "_characterize_data_settings.json" postfix). This file can then be
840840 used to override the default parameter settings with user defaults in a reproducible
841841 manner via the --configuration_file option.
842- 3. pdf figure with the histogram of image sizes.
843- 4. Possibly a pdf figure with histogram of min-max intensity values for the scalar images, if any.
844- 5. Possibly a csv file listing exact duplicate images, if any. Images are considered duplicates if
842+ 3. Two or three scatterplots in pdf/png format. File format is determined based on the number of
843+ images. If there are more than 500,000 images the png format is used, otherwise pdf. This avoids excessively long
844+ rendering times associated with the vector graphics format which renders each individual point in the
845+ scatterplot. Preference is to use a vector graphics format which allows for resizing without loss of
846+ quality. If you require a vector graphics format even for large datasets, you will need to modify the
847+ PDF_FOMAT_THRESHOLD value in the script.
848+ Plots include: image sizes, image spacing, and possibly min-max intensity values for
849+ scalar images. Image size and spacings are 2D plots. When dealing with 3D images, information
850+ along the z axis is encoded using color.
851+ 4. Possibly a csv file listing exact duplicate images, if any. Images are considered duplicates if
845852 the intensity values are the same, header and spatial information may be different.
846853
847854 Empty lines in the resulting csv file (file names listed but nothing else in that row)
@@ -897,6 +904,12 @@ def xyz_to_index(x, y, z, thumbnail_size, tile_size):
897904 When this happens you will see a WARNING printed to the terminal output, along the lines of
898905 "ImageSeriesReader : Non uniform sampling or missing slices detected...".
899906 """
907+ # Maximal number of points for which scatterplots are saved in pdf format,
908+ # otherwise png. Threshold was determined empirically based on rendering
909+ # times longer than 10sec on a 2020 MacBook Pro (1.4GHz Quad core Intel i5
910+ # with 16GB RAM).
911+ PDF_FOMAT_THRESHOLD = 500000
912+
900913 # Configure argument parser for commandline arguments and set default
901914 # values.
902915 # We use two parsers, one for the optional parameters and the other for positional and
@@ -1233,14 +1246,14 @@ def xyz_to_index(x, y, z, thumbnail_size, tile_size):
12331246 size_ax .set_ylabel ("y size" )
12341247 size_fig .tight_layout ()
12351248 size_fig .savefig (
1236- f"{ os .path .splitext (args .output_file )[0 ]} _image_size_scatterplot.pdf" ,
1249+ f"{ os .path .splitext (args .output_file )[0 ]} _image_size_scatterplot.{ 'png' if len ( df ) > PDF_FOMAT_THRESHOLD else ' pdf' } " ,
12371250 bbox_inches = "tight" ,
12381251 )
12391252 spacing_ax .set_xlabel ("x spacing [mm]" )
12401253 spacing_ax .set_ylabel ("y spacing [mm]" )
12411254 spacing_fig .tight_layout ()
12421255 spacing_fig .savefig (
1243- f"{ os .path .splitext (args .output_file )[0 ]} _image_spacing_scatterplot.pdf" ,
1256+ f"{ os .path .splitext (args .output_file )[0 ]} _image_spacing_scatterplot.{ 'png' if len ( df ) > PDF_FOMAT_THRESHOLD else ' pdf' } " ,
12441257 bbox_inches = "tight" ,
12451258 )
12461259
@@ -1254,7 +1267,7 @@ def xyz_to_index(x, y, z, thumbnail_size, tile_size):
12541267 ax .set_xlabel ("min intensity" )
12551268 ax .set_ylabel ("max intensity" )
12561269 fig .savefig (
1257- f"{ os .path .splitext (args .output_file )[0 ]} _min_max_intensity_scatterplot.pdf" ,
1270+ f"{ os .path .splitext (args .output_file )[0 ]} _min_max_intensity_scatterplot.{ 'png' if len ( df ) > PDF_FOMAT_THRESHOLD else ' pdf' } " ,
12581271 bbox_inches = "tight" ,
12591272 )
12601273
0 commit comments