From cacbe9805a1719bd88b4d2343c806217b8acfee1 Mon Sep 17 00:00:00 2001
From: Krzysztof Lecki <klecki@nvidia.com>
Date: Wed, 18 Mar 2020 15:40:05 +0100
Subject: [PATCH 1/3] Introduce OutputDoc and fill it for most operators with
 non-trivial returns

Introduce OutputDoc and OutputDocStr as a mechanism
for creating the Returns section of Operator __call__,
similar to InputDoc & InputDocStr that were already present.

Allow for unnamed outputs

Signed-off-by: Krzysztof Lecki <klecki@nvidia.com>
---
 dali/operators/audio/nonsilence_op.cc         |  30 +--
 .../decoder/audio/audio_decoder_op.cc         |   3 +
 dali/operators/decoder/image_decoder.cc       |  20 +-
 dali/operators/generic/reshape.cc             |   8 +-
 dali/operators/generic/slice/slice.cc         |   6 +-
 dali/operators/image/crop/bbox_crop.cc        |  16 +-
 .../random/normal_distribution_op.cc          |  28 ++-
 dali/operators/reader/caffe2_reader_op.cc     |   7 +
 dali/operators/reader/caffe_reader_op.cc      |   5 +
 dali/operators/reader/coco_reader_op.cc       |  13 +-
 dali/operators/reader/file_reader_op.cc       |   6 +-
 dali/operators/reader/mxnet_reader_op.cc      |   2 +
 .../sequence/optical_flow/optical_flow.cc     |  31 +--
 dali/operators/ssd/box_encoder.cc             |  38 +--
 dali/pipeline/operator/op_schema.cc           | 103 +++++++-
 dali/pipeline/operator/op_schema.h            | 230 ++++++++++--------
 dali/python/backend_impl.cc                   |  27 +-
 dali/python/nvidia/dali/ops.py                |  95 +++++---
 docs/api.rst                                  |   1 +
 docs/data_types.rst                           |   2 +
 20 files changed, 458 insertions(+), 213 deletions(-)

diff --git a/dali/operators/audio/nonsilence_op.cc b/dali/operators/audio/nonsilence_op.cc
index c45d439ff07..6f526fe2dea 100644
--- a/dali/operators/audio/nonsilence_op.cc
+++ b/dali/operators/audio/nonsilence_op.cc
@@ -19,7 +19,7 @@
 namespace dali {
 
 DALI_SCHEMA(NonsilentRegion)
-                .DocStr(R"code(The operator performs leading and trailing silence detection in an audio buffer.
+    .DocStr(R"code(The operator performs leading and trailing silence detection in an audio buffer.
 The operator returns the beginning and length of the non-silent region by comparing short term power of the signal
 with a silence cut-off threshold. The signal is consider silence when ``short_term_power_db < cutoff_db`` with::
 
@@ -35,22 +35,24 @@ Inputs/Outputs
 Remarks
   - If ``Outputs[1] == 0``, ``Outputs[0]`` value is undefined
 )code")
-                .NumInput(1)
-                .NumOutput(detail::kNumOutputs)
-                .AddOptionalArg("cutoff_db",
-                                R"code(The threshold [dB], below which everything is considered as silence)code",
-                                -60.f)
-                .AddOptionalArg("window_length", R"code(Size of a sliding window.
+    .NumInput(1)
+    .NumOutput(detail::kNumOutputs)
+    .InputDoc(0, "audio_buffer", "1D TensorList", "Batch of audio buffers")
+    .OutputDoc(0, "start", "TensorList of int",
+               "Start positions, in samples, of nonsilent regions.")
+    .OutputDoc(1, "length", "TensorList of int", "Lengths, in samples, of nonsilent regions.")
+    .AddOptionalArg("cutoff_db",
+        R"code(The threshold [dB], below which everything is considered as silence)code", -60.f)
+    .AddOptionalArg("window_length", R"code(Size of a sliding window.
 The sliding window is used to calculate short-term power of the signal.)code", 2048)
-                .AddOptionalArg("reference_power",
-                                R"code(The reference power used for converting signal to db.
+    .AddOptionalArg("reference_power",
+                    R"code(The reference power used for converting signal to db.
 If ``reference_power`` is not provided, the maximum of the signal will be used as the reference power)code",
-                                0.f)
-                .AddOptionalArg("reset_interval",
-                                R"code(The number of samples after which the moving mean average is
+                    0.f)
+    .AddOptionalArg("reset_interval",
+                    R"code(The number of samples after which the moving mean average is
 recalculated to avoid loss of precision. If ``reset_interval == -1`` or the input type allows exact calculation,
-the average won't be reset. The default value should fit most of the use cases.)code",
-                                8192);
+the average won't be reset. The default value should fit most of the use cases.)code", 8192);
 
 DALI_REGISTER_OPERATOR(NonsilentRegion, NonsilenceOperatorCpu, CPU);
 
diff --git a/dali/operators/decoder/audio/audio_decoder_op.cc b/dali/operators/decoder/audio/audio_decoder_op.cc
index 12dbeaa6dbf..f3e5179de3e 100644
--- a/dali/operators/decoder/audio/audio_decoder_op.cc
+++ b/dali/operators/decoder/audio/audio_decoder_op.cc
@@ -30,6 +30,9 @@ This operator produces two outputs:
 )code")
   .NumInput(1)
   .NumOutput(2)
+  .OutputDoc(0, "decoded", "TensorList of int16, int32 or float", "The decoded audio recordings.")
+  .OutputDoc(1, "sampling_rate", "TensorList of float",
+             "The sampling rates corresponding to the decoded sound recordings [Hz].")
   .AddOptionalArg("sample_rate",
           "If specified, the target sample rate, in Hz, to which the audio is resampled.",
           0.0f, true)
diff --git a/dali/operators/decoder/image_decoder.cc b/dali/operators/decoder/image_decoder.cc
index 35eca425e1f..8dcad9168af 100644
--- a/dali/operators/decoder/image_decoder.cc
+++ b/dali/operators/decoder/image_decoder.cc
@@ -95,9 +95,8 @@ Output of the decoder is in `HWC` ordering.)code")
   .AddParent("ImageDecoderAttr")
   .AddParent("RandomCropAttr");
 
-
 DALI_SCHEMA(ImageDecoderSlice)
-  .DocStr(R"code(Decode images on the host with a cropping window of given size and anchor.
+    .DocStr(R"code(Decode images on the host with a cropping window of given size and anchor.
 Inputs must be supplied as 3 separate tensors in a specific order: `data`
 containing input data, `anchor` containing either normalized or absolute coordinates
 (depending on the value of `normalized_anchor`) for the starting point of the
@@ -111,9 +110,18 @@ coordinates and `WH` order for the slice arguments.
 When possible, will make use of partial decoding (e.g. libjpeg-turbo, nvJPEG).
 When not supported, will decode the whole image and then crop.
 Output of the decoder is in `HWC` ordering.)code")
-  .NumInput(3)
-  .NumOutput(1)
-  .AddParent("ImageDecoderAttr")
-  .AddParent("SliceAttr");
+    .NumInput(3)
+    .InputDoc(0, "data", "TensorList", "Batch containing input data")
+    .InputDoc(1, "anchor", "1D TensorList of float",
+              R"code(Input containing either normalized or absolute coordinates
+(depending on the value of `normalized_anchor`) for the starting point of the
+slice (x0, x1, x2, ...).)code")
+    .InputDoc(2, "shape", "1D TensorList of float",
+              R"code(Input containing either normalized or absolute coordinates
+(depending on the value of `normalized_shape`) for the dimensions of the slice
+(s0, s1, s2, ...).)code")
+    .NumOutput(1)
+    .AddParent("ImageDecoderAttr")
+    .AddParent("SliceAttr");
 
 }  // namespace dali
diff --git a/dali/operators/generic/reshape.cc b/dali/operators/generic/reshape.cc
index be99691adb1..e1e71615cfd 100644
--- a/dali/operators/generic/reshape.cc
+++ b/dali/operators/generic/reshape.cc
@@ -29,8 +29,8 @@ DALI_SCHEMA(Reshape)
   The buffer contents are not copied.)")
   .NumInput(1, 2)
   .NumOutput(1)
-  .InputDox(0, "data", "TensorList", "Data to be reshaped")
-  .InputDox(1, "shape_input", "1D TensorList of integers", "Same as `shape` keyword argument")
+  .InputDoc(0, "data", "TensorList", "Data to be reshaped")
+  .InputDoc(1, "shape_input", "1D TensorList of int", "Same as `shape` keyword argument")
   .PassThrough({{0, 0}})
   .AllowSequences()
   .SupportVolumetric()
@@ -59,8 +59,8 @@ DALI_SCHEMA(Reinterpret)
   The buffer contents are not copied.)")
   .NumInput(1, 2)
   .NumOutput(1)
-  .InputDox(0, "data", "TensorList", "Data to be reshaped")
-  .InputDox(1, "shape_input", "1D TensorList of integers", "Same as `shape` keyword argument")
+  .InputDoc(0, "data", "TensorList", "Data to be reshaped")
+  .InputDoc(1, "shape_input", "1D TensorList of int", "Same as `shape` keyword argument")
   .PassThrough({{0, 0}})
   .AllowSequences()
   .SupportVolumetric()
diff --git a/dali/operators/generic/slice/slice.cc b/dali/operators/generic/slice/slice.cc
index 882ab356c69..3b4c23ef424 100644
--- a/dali/operators/generic/slice/slice.cc
+++ b/dali/operators/generic/slice/slice.cc
@@ -27,12 +27,12 @@ with arguments `axis_names` or `axes`. By default `Slice` operator uses normaliz
 coordinates and `WH` order for the slice arguments.)code")
     .NumInput(3)
     .NumOutput(1)
-    .InputDox(0, "data", "TensorList", "Batch containing input data")
-    .InputDox(1, "anchor", "1D TensorList of floats",
+    .InputDoc(0, "data", "TensorList", "Batch containing input data")
+    .InputDoc(1, "anchor", "1D TensorList of float",
                  R"code(Input containing either normalized or absolute coordinates
 (depending on the value of `normalized_anchor`) for the starting point of the
 slice (x0, x1, x2, ...).)code")
-    .InputDox(2, "shape", "1D TensorList of floats",
+    .InputDoc(2, "shape", "1D TensorList of float",
                  R"code(Input containing either normalized or absolute coordinates
 (depending on the value of `normalized_shape`) for the dimensions of the slice
 (s0, s1, s2, ...).)code")
diff --git a/dali/operators/image/crop/bbox_crop.cc b/dali/operators/image/crop/bbox_crop.cc
index ad200fbeb29..7506e061fc1 100644
--- a/dali/operators/image/crop/bbox_crop.cc
+++ b/dali/operators/image/crop/bbox_crop.cc
@@ -149,13 +149,23 @@ relative terms, depending of whether the fixed ``crop_shape`` was used or not.
 The third and fourth output correspond to the adjusted bounding boxes and optionally
 their corresponding labels. Bounding boxes are always specified in relative coordinates.)code")
     .NumInput(1, 2)  // [boxes, labels (optional),]
-    .InputDox(
+    .InputDoc(
         0, "boxes", "2D TensorList of float",
         "Relative coordinates of the bounding boxes represented as a 2D tensor where the first "
         "dimension refers to the index of the bounding box and the second dimension refers to the "
         "index of the coordinate.")
-    .InputDox(1, "labels", "1D TensorList of integers",
+    .InputDoc(1, "labels", "1D TensorList of int",
               "(optional) Labels associated with each of the bounding boxes.")
+    .OutputDocStr(R"code(anchor : TensorList of {batch, 2} or {batch, 3} float
+    Slice-compatible anchors [x, y, (z,)] of calculated crop windows.
+shape : TensorList of {batch, 2} or {batch, 3} float
+    Slice-compatible dimensions [w, h, (d,)] of calculated crop windows.
+bboxes : 2D TensorList of float
+    Adjusted bounding boxes. Each sample ``i`` has shape ``{m_i, 4}`` representing ``m_i`` bounding boxes
+    that are valid for given crop window.
+labels : 2D TensorList of int, optional
+    Labels corresponding to bounding boxes. Each sample ``i`` has shape ``{m_i, 1}`` representing
+    ``m_i`` labels.)code")
     .NumOutput(3)  // [anchor, shape, bboxes, labels (optional),]
     .AdditionalOutputsFn([](const OpSpec &spec) {
       return spec.NumRegularInput() - 1;  // +1 if labels are provided
@@ -198,7 +208,7 @@ explicitly.)code",
 
 Value for ``min`` should satisfy ``0.0 <= min <= max``.
 
-Note: Providing ``aspect_ratio`` and ``scaling`` is incompatible with specifying `crop_shape`
+Note: Providing ``aspect_ratio`` and ``scaling`` is incompatible with specifying ``crop_shape``
 explicitly)code",
         std::vector<float>{1.f, 1.f})
     .AddOptionalArg(
diff --git a/dali/operators/random/normal_distribution_op.cc b/dali/operators/random/normal_distribution_op.cc
index fddce7041a2..0ade75962e2 100644
--- a/dali/operators/random/normal_distribution_op.cc
+++ b/dali/operators/random/normal_distribution_op.cc
@@ -21,23 +21,25 @@
 namespace dali {
 
 DALI_SCHEMA(NormalDistribution)
-                .DocStr(R"code(Creates a tensor that consists of data distributed normally.
+    .DocStr(R"code(Creates a tensor that consists of data distributed normally.
 This operator can be ran in 3 modes, which determine the shape of the output tensor:
 1. Providing an input batch to this operator results in a batch of output tensors, which have the same shape as the input tensors.
 2. Providing a custom `shape` as an argument results in an output batch, where every tensor has the same (given) shape.
 3. Providing no input arguments results in an output batch of scalars, distributed normally.)code")
-                .NumInput(0, 1)
-                .NumOutput(detail::kNumOutputs)
-                .AddOptionalArg(detail::kMean, R"code(Mean value of the distribution)code",
-                                0.f, true)
-                .AddOptionalArg(detail::kStddev,
-                                R"code(Standard deviation of the distribution)code",
-                                1.f, true)
-                .AddOptionalArg(detail::kShape,
-                                R"code(Shape of single output tensor in a batch)code",
-                                detail::kShapeDefaultValue)
-                .AddOptionalArg(arg_names::kDtype, R"code(Data type for the output)code",
-                                DALI_FLOAT);
+    .NumInput(0, 1)
+    .InputDoc(0, "data", "TensorList",
+        "If provided, the output is given the same shape as `data` (`data` contents are ignored)")
+    .NumOutput(detail::kNumOutputs)
+    .AddOptionalArg(detail::kMean, R"code(Mean value of the distribution)code",
+                    0.f, true)
+    .AddOptionalArg(detail::kStddev,
+                    R"code(Standard deviation of the distribution)code",
+                    1.f, true)
+    .AddOptionalArg(detail::kShape,
+                    R"code(Shape of single output tensor in a batch)code",
+                    detail::kShapeDefaultValue)
+    .AddOptionalArg(arg_names::kDtype, R"code(Data type for the output)code",
+                    DALI_FLOAT);
 
 DALI_REGISTER_OPERATOR(NormalDistribution, NormalDistributionCpu, CPU);
 
diff --git a/dali/operators/reader/caffe2_reader_op.cc b/dali/operators/reader/caffe2_reader_op.cc
index 7861c91f113..96506d52af4 100644
--- a/dali/operators/reader/caffe2_reader_op.cc
+++ b/dali/operators/reader/caffe2_reader_op.cc
@@ -33,6 +33,13 @@ DALI_SCHEMA(Caffe2Reader)
       int has_bbox = static_cast<int>(spec.GetArgument<bool>("bbox"));
     return img_idx + num_label_outputs + additional_inputs + has_bbox;
   })
+  .OutputDocStr(R"code(images : 1D TensorList of uint8, optional
+    encoded image data, only if ``image_available = true``.
+*labels : optional
+    One or more output batches of labels, depending on the reader configuration.
+*additional_outputs : optional
+    Additional auxiliary data tensors provided for each sample.
+  )code")
   .AddArg("path",
       R"code(List of paths to Caffe2 LMDB directories.)code",
       DALI_STRING_VEC)
diff --git a/dali/operators/reader/caffe_reader_op.cc b/dali/operators/reader/caffe_reader_op.cc
index 9d23809212d..50a33e6127a 100644
--- a/dali/operators/reader/caffe_reader_op.cc
+++ b/dali/operators/reader/caffe_reader_op.cc
@@ -26,6 +26,11 @@ DALI_SCHEMA(CaffeReader)
     auto label_available = spec.GetArgument<bool>("label_available");
     return image_available + label_available;
   })
+  .OutputDocStr(R"code(images : 1D TensorList of uint8, optional
+    encoded image data, only if ``image_available = true``.
+labels : 1D TensorList of int, optional
+    Batch of labels corresponding to images, only if ``label_available = true``.
+  )code")
   .AddArg("path",
       R"code(List of paths to Caffe LMDB directories.)code",
       DALI_STRING_VEC)
diff --git a/dali/operators/reader/coco_reader_op.cc b/dali/operators/reader/coco_reader_op.cc
index 125eb44fad2..cb60fe4ce7c 100755
--- a/dali/operators/reader/coco_reader_op.cc
+++ b/dali/operators/reader/coco_reader_op.cc
@@ -21,8 +21,17 @@ DALI_SCHEMA(COCOReader)
   .NumInput(0)
   .NumOutput(3)
   .DocStr(R"code(Read data from a COCO dataset composed of directory with images
-and an annotation files. For each image, with `m` bboxes, returns its bboxes as `(m,4)`
-Tensor (``m * [x, y, w, h]`` or ``m * [left, top, right, bottom]``) and labels as `(m,1)` Tensor (``m * category_id``).)code")
+and an annotation files. For each image ``i``, with ``m_i`` bboxes, returns its bboxes as
+``{m_i, 4}`` Tensor (``m_i * [x, y, w, h]`` or ``m_i * [left, top, right, bottom]``)
+and labels as ``{m_i, 1}`` Tensor (``m_i * category_id``).)code")
+  .OutputDocStr(R"code(images : 1D TensorList of uint8
+    Encoded image data.
+bboxes : 2D TensorList of float
+    Bounding boxes of the image. Each sample ``i`` has shape ``{m_i, 4}`` representing ``m_i``
+    bounding boxes.
+labels : 2D TensorList of int
+    Labels corresponding to bounding boxes. Each sample ``i`` has shape ``{m_i, 1}`` representing
+    ``m_i`` labels.)code")
   .AddOptionalArg(
     "meta_files_path",
     "Path to directory with meta files containing preprocessed COCO annotations.",
diff --git a/dali/operators/reader/file_reader_op.cc b/dali/operators/reader/file_reader_op.cc
index ab86d6d7bfb..3f3cb3345c6 100644
--- a/dali/operators/reader/file_reader_op.cc
+++ b/dali/operators/reader/file_reader_op.cc
@@ -24,6 +24,8 @@ DALI_SCHEMA(FileReader)
   .DocStr("Read (Image, label) pairs from a directory")
   .NumInput(0)
   .NumOutput(2)  // (Images, Labels)
+  .OutputDoc(0, "data", "1D TensorList of uint8", "Raw file contents.")
+  .OutputDoc(1, "labels", "1D TensorList of int", "Batch of labels corresponding to files.")
   .AddArg("file_root",
       R"code(Path to a directory containing data files.
 ``FileReader`` supports flat directory structure. ``file_root`` directory should contain
@@ -33,8 +35,8 @@ directories with images in them. To obtain labels ``FileReader`` sorts directori
   .AddOptionalArg("file_list",
       R"code(Path to a text file containing rows of ``filename label`` pairs, where the filenames are
 relative to ``file_root``.
-If left empty, ``file_root`` is traversed for subdirectories (only those at one level deep from 
-``file_root``) containing files associated with the same label. When traversing subdirectories, 
+If left empty, ``file_root`` is traversed for subdirectories (only those at one level deep from
+``file_root``) containing files associated with the same label. When traversing subdirectories,
 labels are assigned consecutive numbers.)code",
       std::string())
 .AddOptionalArg("shuffle_after_epoch",
diff --git a/dali/operators/reader/mxnet_reader_op.cc b/dali/operators/reader/mxnet_reader_op.cc
index 3e798d4e33c..995873c50bb 100644
--- a/dali/operators/reader/mxnet_reader_op.cc
+++ b/dali/operators/reader/mxnet_reader_op.cc
@@ -23,6 +23,8 @@ DALI_SCHEMA(MXNetReader)
   .DocStr("Read sample data from a MXNet RecordIO.")
   .NumInput(0)
   .NumOutput(2)
+  .OutputDoc(0, "data", "1D TensorList of uint8", "Raw data buffers.")
+  .OutputDoc(1, "labels", "1D TensorList of int", "Batch of labels corresponding to the buffers.")
   .AddArg("path",
       R"code(List of paths to RecordIO files.)code",
       DALI_STRING_VEC)
diff --git a/dali/operators/sequence/optical_flow/optical_flow.cc b/dali/operators/sequence/optical_flow/optical_flow.cc
index f8cda2b0bae..e0fbee72419 100644
--- a/dali/operators/sequence/optical_flow/optical_flow.cc
+++ b/dali/operators/sequence/optical_flow/optical_flow.cc
@@ -25,28 +25,29 @@ As an optional input, operator accepts external hints for OF calculation.
 The output format of this operator matches the output format of OF driver API.
 Dali uses Turing optical flow hardware implementation: https://developer.nvidia.com/opticalflow-sdk
 )code")
-                .NumInput(1, 2)
-                .NumOutput(1)
-                .AddOptionalArg(detail::kPresetArgName, R"code(Setting quality level of OF calculation.
+    .NumInput(1, 2)
+    .InputDoc(0, "frame_seq", "TensorList of uint8", "Batch of input sequences to calculate OF")
+    .InputDoc(1, "hints", "TensorList of float", "Batch of external hints for OF")
+    .NumOutput(1)
+    .AddOptionalArg(detail::kPresetArgName, R"code(Setting quality level of OF calculation.
  0.0f ... 1.0f, where 1.0f is best quality, lowest speed)code", .0f, false)
-                .AddOptionalArg(detail::kOutputFormatArgName,
-                                R"code(Setting grid size for output vector.
+    .AddOptionalArg(detail::kOutputFormatArgName,
+                    R"code(Setting grid size for output vector.
 Value defines width of grid square (e.g. if value == 4, 4x4 grid is used).
 For values <=0, grid size is undefined. Currently only grid_size=4 is supported.)code", -1, false)
-                .AddOptionalArg(detail::kEnableTemporalHintsArgName,
-                                R"code(enabling/disabling temporal hints for sequences longer than 2 images.
+    .AddOptionalArg(detail::kEnableTemporalHintsArgName,
+                    R"code(enabling/disabling temporal hints for sequences longer than 2 images.
 They are used to speed up calculation: previous OF result in sequence is used to calculate current flow. You might
 want to use temporal hints for sequences, that don't have much changes in the scene (e.g. only moving objects))code",
                                 false, false)
-                .AddOptionalArg(detail::kEnableExternalHintsArgName,
-                                R"code(enabling/disabling external hints for OF calculation. External hints
+    .AddOptionalArg(detail::kEnableExternalHintsArgName,
+                    R"code(enabling/disabling external hints for OF calculation. External hints
 are analogous to temporal hints, only they come from external source. When this option is enabled,
-Operator requires 2 inputs.)code",
-                                false, false)
-                .AddOptionalArg(detail::kImageTypeArgName,
-                                R"code(Type of input images (RGB, BGR, GRAY))code", DALI_RGB,
-                                false)
-                .AllowSequences();
+Operator requires 2 inputs.)code", false, false)
+    .AddOptionalArg(detail::kImageTypeArgName,
+                    R"code(Type of input images (RGB, BGR, GRAY))code", DALI_RGB,
+                    false)
+    .AllowSequences();
 
 
 DALI_REGISTER_OPERATOR(OpticalFlow, OpticalFlow<GPUBackend>, GPU);
diff --git a/dali/operators/ssd/box_encoder.cc b/dali/operators/ssd/box_encoder.cc
index 502a131ce54..42b64e919b4 100644
--- a/dali/operators/ssd/box_encoder.cc
+++ b/dali/operators/ssd/box_encoder.cc
@@ -191,15 +191,24 @@ DALI_SCHEMA(BoxEncoder)
         R"code(Encodes input bounding boxes and labels using set of default boxes (anchors) passed
 during op construction. Follows algorithm described in https://arxiv.org/abs/1512.02325 and
 implemented in https://github.com/mlperf/training/tree/master/single_stage_detector/ssd
-Inputs must be supplied as two Tensors: `BBoxes` containing bounding boxes represented as
-`[l,t,r,b]`, and `Labels` containing the corresponding label for each bounding box.
-Results are two tensors: `EncodedBBoxes` containing M encoded bounding boxes as `[l,t,r,b]`,
-where M is number of anchors and `EncodedLabels` containing the corresponding label for each
+Inputs must be supplied as two Tensors: `bboxes` containing bounding boxes represented as
+`[l,t,r,b]`, and `labels` containing the corresponding label for each bounding box.
+Results are two tensors: `encoded_bboxes` containing M encoded bounding boxes as `[l,t,r,b]`,
+where M is number of anchors and `encoded_labels` containing the corresponding label for each
 encoded box.)code")
     .NumInput(2)
+    .InputDoc(0, "bboxes", "2D TensorList of float",
+              "Bounding boxes to encode in `[l, t, r, b]` format, each sample ``i`` can have shape "
+              "``{m_i, 4}`` to represent ``m_i`` boxes")
+    .InputDoc(1, "labels", "2D TensorList of int",
+              "Labels corresponding to bounding boxes, sample ``i`` should have shape ``{m_i, 1}``")
     .NumOutput(2)
+    .OutputDoc(0, "encoded_bboxes", "TensorList of {batch, M, 4} float",
+               "Batch of encoded bounding boxes.")
+    .OutputDoc(1, "encoded_labels", "TensorList of {batch, M} int",
+               "Batch of corresponding labels.")
     .AddArg("anchors",
-            R"code(Anchors to be used for encoding. List of floats in ltrb format.)code",
+            R"code(Anchors to be used for encoding. List of 4 * M floats in ltrb format.)code",
             DALI_FLOAT_VEC)
     .AddOptionalArg(
         "criteria",
@@ -208,16 +217,15 @@ encoded box.)code")
     .AddOptionalArg(
         "offset",
         R"code(Returns normalized offsets `((encoded_bboxes*scale - anchors*scale) - mean) / stds`
-in `EncodedBBoxes` using `std`, `mean` and `scale` arguments (default values are transparent).)code",
+in `encoded_bboxes` using `std`, `mean` and `scale` arguments (default values are transparent).)code",
         false)
-    .AddOptionalArg("scale",
-            R"code(Rescale the box and anchors values before offset calculation (e.g. to get back to absolute values).)code",
-            1.0f)
-    .AddOptionalArg("means",
-            R"code([x y w h] means for offset normalization.)code",
-            std::vector<float>{0.f, 0.f, 0.f, 0.f})
-    .AddOptionalArg("stds",
-            R"code([x y w h] standard deviations for offset normalization.)code",
-            std::vector<float>{1.f, 1.f, 1.f, 1.f});
+    .AddOptionalArg(
+        "scale",
+        R"code(Rescale the box and anchors values before offset calculation (e.g. to get back to absolute values).)code",
+        1.0f)
+    .AddOptionalArg("means", R"code([x y w h] means for offset normalization.)code",
+                    std::vector<float>{0.f, 0.f, 0.f, 0.f})
+    .AddOptionalArg("stds", R"code([x y w h] standard deviations for offset normalization.)code",
+                    std::vector<float>{1.f, 1.f, 1.f, 1.f});
 
 }  // namespace dali
diff --git a/dali/pipeline/operator/op_schema.cc b/dali/pipeline/operator/op_schema.cc
index 2b84721a57f..bd2ceb15c97 100644
--- a/dali/pipeline/operator/op_schema.cc
+++ b/dali/pipeline/operator/op_schema.cc
@@ -86,10 +86,111 @@ void OpSchema::CheckArgs(const OpSpec &spec) const {
   }
 }
 
-string OpSchema::Dox() const {
+string OpSchema::DocStr() const {
   return dox_;
 }
 
+OpSchema &OpSchema::InputDoc(int index, const string &name, const string &type_doc,
+                             const string &doc) {  // records the doc entry for input `index`
+  CheckInputIndex(index);
+  DALI_ENFORCE(!name.empty(), "Name of the input should not be empty");
+  DALI_ENFORCE(!type_doc.empty(), "Type of the input should not be empty");
+  DALI_ENFORCE(!doc.empty(), "Doc of the input should not be empty");
+  DALI_ENFORCE(call_dox_str_.empty(),  // call_dox_str_ is set by CallDocStr()
+               "Providing docstrings for inputs is not supported when the CallDocStr was used.");
+  input_dox_set_ = true;  // CallDocStr() and InputDocStr() are rejected once this is set
+  input_dox_[index] = {name, type_doc, doc};
+  return *this;
+}
+
+DLL_PUBLIC OpSchema &OpSchema::OutputDoc(int index, const string &name, const string &type_doc,
+                                         const string &doc) {  // `name` may be empty
+  CheckOutputIndex(index);
+  DALI_ENFORCE(!output_fn_, "Output doc cannot be used when the OutputFn was set");
+  DALI_ENFORCE(!additional_outputs_fn_,  // per-output docs need a static number of outputs
+               "Output doc cannot be used when the AdditionalOutputsFn was set");
+  DALI_ENFORCE(!type_doc.empty(), "Type of the output should not be empty");
+  DALI_ENFORCE(!doc.empty(), "Doc of the output should not be empty");
+  DALI_ENFORCE(output_dox_str_.empty(),  // output_dox_str_ is set by OutputDocStr()
+               "Providing docstrings for output is not supported when the OutputDocStr was used.");
+  output_dox_set_ = true;  // CallDocStr() and OutputDocStr() are rejected once this is set
+  output_dox_[index] = {name, type_doc, doc};
+  return *this;
+}
+
+DLL_PUBLIC OpSchema &OpSchema::CallDocStr(const std::string &doc, bool append_kwargs_section) {
+  DALI_ENFORCE(!doc.empty(), "The custom docstring for __call__ should not be empty.");
+  // A whole-call docstring cannot be combined with per-input or per-output docs.
+  DALI_ENFORCE(!input_dox_set_,  // input_dox_set_ is raised by InputDoc()
+               "Providing docstring for `__call__` is not supported when docstrings for separate "
+               "inputs were set using InputDoc.");
+  DALI_ENFORCE(!output_dox_set_,  // output_dox_set_ is raised by OutputDoc()
+               "Providing docstring for `__call__` is not supported when docstrings for separate "
+               "outputs were set using OutputDoc.");
+  call_dox_str_ = doc;
+  append_kwargs_section_ = append_kwargs_section;
+  return *this;
+}
+
+DLL_PUBLIC OpSchema &OpSchema::InputDocStr(const std::string &doc) {  // custom `Args` section
+  DALI_ENFORCE(!doc.empty(), "The custom `Args` section for __call__ should not be empty.");
+  DALI_ENFORCE(!input_dox_set_,  // input_dox_set_ is raised by InputDoc()
+               "Providing custom `Args` section for `__call__` is not supported when docstrings "
+               "for separate inputs were set using InputDoc.");
+  input_dox_str_ = doc;
+  return *this;
+}
+
+DLL_PUBLIC OpSchema &OpSchema::OutputDocStr(const std::string &doc) {  // custom `Returns` section
+  DALI_ENFORCE(!doc.empty(), "The custom `Returns` section for __call__ should not be empty.");
+  DALI_ENFORCE(!output_dox_set_,  // output_dox_set_ is raised by OutputDoc()
+               "Providing custom `Returns` section for `__call__` is not supported when "
+               "docstrings for separate outputs were set using OutputDoc.");
+  output_dox_str_ = doc;  // OutputDoc() is rejected once this is non-empty
+  return *this;
+}
+
+std::string OpSchema::GetCallSignatureInputs() {  // builds "in0, in1, opt = None" style list
+  DALI_ENFORCE(HasPerInputDoc(),
+                "Input documentation was not specified for this operator.");
+  std::stringstream result;
+  for (int i = 0; i < MinNumInput(); i++) {  // mandatory inputs: bare names
+    result << input_dox_[i].name;
+    if (i < MaxNumInput() - 1) {  // separator after every input except the last one
+      result << ", ";
+    }
+  }
+  for (int i = MinNumInput(); i < MaxNumInput(); i++) {  // optional inputs: `name = None`
+    result << input_dox_[i].name << " = None";
+    if (i < MaxNumInput() - 1) {  // separator after every input except the last one
+      result << ", ";
+    }
+  }
+  return result.str();
+}
+
+OpSchema::InOutDoc OpSchema::GetPerInputDoc(int input_idx) {  // doc entry of one input
+  CheckInputIndex(input_idx);
+  DALI_ENFORCE(HasPerInputDoc(),
+               "Input documentation was not specified for this operator.");
+  DALI_ENFORCE(!input_dox_[input_idx].name.empty(),  // InputDoc() never stores an empty name
+               make_string("Docstring for input ", input_idx,
+                           " was not set. All inputs should be documented."));
+  return input_dox_[input_idx];
+}
+
+OpSchema::InOutDoc OpSchema::GetPerOutputDoc(int output_idx) {  // doc entry of one output
+  CheckOutputIndex(output_idx);
+  DALI_ENFORCE(HasPerOutputDoc(),
+               "Output documentation was not specified for this operator.");
+  // Output names may be empty, so a non-empty type_doc also counts as "documented".
+  DALI_ENFORCE(!output_dox_[output_idx].name.empty() || !output_dox_[output_idx].type_doc.empty(),
+               make_string("Docstring for output ", output_idx,
+                           " was not set. All outputs should be documented."));
+  return output_dox_[output_idx];
+}
+
+
+
 std::map<std::string, RequiredArgumentDef>
 OpSchema::GetRequiredArguments() const {
   auto ret = arguments_;
diff --git a/dali/pipeline/operator/op_schema.h b/dali/pipeline/operator/op_schema.h
index d9e854d910b..c5d15bd9130 100644
--- a/dali/pipeline/operator/op_schema.h
+++ b/dali/pipeline/operator/op_schema.h
@@ -93,6 +93,25 @@ class DLL_PUBLIC OpSchema {
     return *this;
   }
 
+  /**
+   * @brief Allows to set a docstring for __call__ method of Operator.
+   *
+   * The first line of the string can contain the signature that will be used
+   * in the sphinx-generated documentation, for example:
+   * "__call__(input0, input1, optional_input = None, **kwargs)\n"
+   *
+   * The doc should follow numpydoc format, and can either contain
+   * `Args`, `Returns` sections or they can be provided separately using InputDocStr
+   * and OutputDocStr respectively.
+   *
+   * If the `append_kwargs_section` is true, the docstring generator will append the Keyword args
+   * section at the end of this doc
+   *
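+   * @param doc Custom docstring for `__call__`; must not be empty.
+   * @param append_kwargs_section Whether to append the Keyword args section after `doc`.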
+   */
+  DLL_PUBLIC OpSchema &CallDocStr(const std::string &doc, bool append_kwargs_section = true);
+
   /**
    * @brief Sets the docstring for input.
    *
@@ -101,56 +120,66 @@ class DLL_PUBLIC OpSchema {
    * If the operator specifies some range of allowed inputs with NumInput(int min, int max)
    * only the first `min` inputs are considered mandatory, the rest are optional
    *
-   * Will generate entry in `Args` section using numpydoc style:
-   * `name`: type_doc
+   * Python bindings will generate entry in `Args` section using numpydoc style:
+   * name: type_doc
    *     doc
    */
-  DLL_PUBLIC inline OpSchema &InputDox(int index, const string &name, const string &type_doc,
-                                          const string &doc) {
-    CheckInputIndex(index);
-    DALI_ENFORCE(!name.empty(), "Name of the argument should not be empty");
-    DALI_ENFORCE(call_dox_.empty(),
-                 "Providing docstrings for inputs is not supported when the CallDocStr was used.");
-    input_dox_set_ = true;
-    input_dox_[index] = {name, type_doc, doc};
-    return *this;
-  }
+  DLL_PUBLIC OpSchema &InputDoc(int index, const string &name, const string &type_doc,
+                                const string &doc);
 
   /**
-   * @brief Allows to set a docstring for __call__ method of Operator.
+   * @brief Sets the docstring for output.
    *
-   * The first line of the string can contain the signature that will be used
-   * in the sphinx-generated documentation, for example:
-   * "__call__(input0, input1, optional_input = None, **kwargs)\n"
+   * Set the documentation for output at given `index`.
+   *
+   * Works only with operators that have a static number of outputs, that is, operators that
+   * use only NumOutput() and not OutputFn() or AdditionalOutputFn().
+   *
+   * name can be empty
    *
-   * The arguments should be described using Args section and numpydoc syntax,
+   * Python bindings will generate entry in `Returns` section using numpydoc style:
+   * name: type_doc
+   *     doc
+   */
+  DLL_PUBLIC OpSchema &OutputDoc(int index, const string &name, const string &type_doc,
+                                 const string &doc);
+
+  /**
+   * @brief Set the `Args` section of `__call__` doc explicitly
+   *
+   * Incompatible with InputDoc.
+   *
+   * The arguments should be described using numpydoc syntax,
    * with comments indented by 4 spaces, for example:
    * """
-   * Args
-   * ----
-   * `input0`: Type of input
+   * input0: Type of input
    *     This is the first input
-   * `input1`: TensorList of some kind
+   * input1: TensorList of some kind
    *     This is second input
-   * `optional_input`: TensorList, optional
+   * optional_input: TensorList, optional
    *     This is optional input
+   * """
    *
-   * If the `append_kwargs_section` is true, the docstring generator will append the Keyword args
-   * section at the end of this doc
-   *
-   * @param doc
-   * @param append_kwargs_section
    */
-  DLL_PUBLIC inline OpSchema &CallDocStr(const string &doc, bool append_kwargs_section = true) {
-    DALI_ENFORCE(!doc.empty(), "The custom docstring for __call__ should not be empty.");
+  DLL_PUBLIC OpSchema &InputDocStr(const std::string &doc);
 
-    DALI_ENFORCE(!input_dox_set_,
-                 "Providing docstring for `__call__` is not supported when docstrings for separate "
-                 "inputs were set using InputDox.");
-    call_dox_ = doc;
-    append_kwargs_section_ = append_kwargs_section;
-    return *this;
-  }
+  /**
+   * @brief Set the `Returns` section of `__call__` doc explicitly
+   *
+   * Incompatible with OutputDoc.
+   *
+   * The arguments should be described using numpydoc syntax,
+   * with comments indented by 4 spaces, for example:
+   * """
+   * output0: TensorList of type
+   *     This is the first output
+   * output1: TensorList of some kind
+   *     This is second output
+   * optional_output: TensorList, optional
+   *     This is optional output
+   * """
+   */
+  DLL_PUBLIC OpSchema &OutputDocStr(const std::string &doc);
 
   /**
    * @brief Sets a function that infers the number of outputs this
@@ -213,6 +242,7 @@ class DLL_PUBLIC OpSchema {
   DLL_PUBLIC inline OpSchema& NumOutput(int n) {
     DALI_ENFORCE(n >= 0);
     num_output_ = n;
+    output_dox_.resize(n);
     return *this;
   }
 
@@ -514,12 +544,12 @@ class DLL_PUBLIC OpSchema {
     return parents_;
   }
 
-  DLL_PUBLIC string Dox() const;
+  DLL_PUBLIC string DocStr() const;
 
   /**
    * @brief Return true wether the default input docs can be used
    */
-  DLL_PUBLIC bool CanUseAutoInputDox() {
+  DLL_PUBLIC bool CanUseAutoInputDoc() {
     return !disable_auto_input_dox_ && MaxNumInput() <= 1;
   }
 
@@ -532,70 +562,70 @@ class DLL_PUBLIC OpSchema {
    *
    * Should be considered as highest preference
    */
-  DLL_PUBLIC bool HasCallDox() {
-    return !call_dox_.empty();
+  DLL_PUBLIC bool HasCallDocStr() {
+    return !call_dox_str_.empty();
   }
 
-  DLL_PUBLIC std::string GetCallDox() {
-    DALI_ENFORCE(HasCallDox(), "__call__ docstring was not set");
-    return call_dox_;
+  DLL_PUBLIC std::string GetCallDocStr() {
+    DALI_ENFORCE(HasCallDocStr(), "__call__ docstring was not set");
+    return call_dox_str_;
+  }
+
+
+  DLL_PUBLIC bool HasInputDocStr() {
+    return !input_dox_str_.empty();
+  }
+
+  DLL_PUBLIC std::string GetInputDocStr() {
+    DALI_ENFORCE(HasInputDocStr(), "Input docstring was not set");  // mirrors GetOutputDocStr()
+    return input_dox_str_;
+  }
+
+  DLL_PUBLIC bool HasOutputDocStr() {
+    return !output_dox_str_.empty();
+  }
+
+  DLL_PUBLIC std::string GetOutputDocStr() {
+    DALI_ENFORCE(HasOutputDocStr(), "Output docstring was not set");
+    return output_dox_str_;
   }
 
   /**
-   * @brief Check if this operator has input docstrings provided
+   * @brief Check if this operator has per-input docstrings provided
    */
-  DLL_PUBLIC bool HasInputDox() {
+  DLL_PUBLIC bool HasPerInputDoc() {
     return input_dox_set_;
   }
 
   /**
-   * @brief List all the inputs that should appear in `__call__` signature based on the input
-   *        docs that were specified. Requires HasInputDox() to return true
-   *
+   * @brief Check if this operator has per-output docstrings provided
    */
-  DLL_PUBLIC std::string GetCallSignatureInputs() {
-    DALI_ENFORCE(HasInputDox(),
-                 "Input documentation was not specified for this operator.");
-    std::stringstream result;
-    for (int i = 0; i < MinNumInput(); i++) {
-      result << input_dox_[i].name;
-      if (i < MaxNumInput() - 1) {
-        result << ", ";
-      }
-    }
-    for (int i = MinNumInput(); i < MaxNumInput(); i++) {
-      result << input_dox_[i].name << " = None";
-      if (i < MaxNumInput() - 1) {
-        result << ", ";
-      }
-    }
-    return result.str();
+  DLL_PUBLIC bool HasPerOutputDoc() {
+    return output_dox_set_;
   }
 
-  DLL_PUBLIC std::string GetInputName(int input_idx) {
-    CheckInputIndex(input_idx);
-    DALI_ENFORCE(HasInputDox(),
-                 "Input documentation was not specified for this operator.");
-    DALI_ENFORCE(!input_dox_[input_idx].name.empty(),
-                 make_string("Docstring for input ", input_idx,
-                             "was not set. All inputs should be documented."));
-    return input_dox_[input_idx].name;
-  }
+  /**
+   * @brief List all the inputs that should appear in `__call__` signature based on the input
+   *        docs that were specified. Requires HasPerInputDoc() to return true
+   *
+   */
+  DLL_PUBLIC std::string GetCallSignatureInputs();
 
-  DLL_PUBLIC std::string GetInputType(int input_idx) {
-    CheckInputIndex(input_idx);
-    DALI_ENFORCE(HasInputDox(),
-                 "Input documentation was not specified for this operator.");
-    return input_dox_[input_idx].type_doc;
-  }
+  struct InOutDoc {
+    std::string name = {};
+    std::string type_doc = {};
+    std::string doc = {};
+  };
 
-  DLL_PUBLIC std::string GetInputDox(int input_idx) {
-    CheckInputIndex(input_idx);
-    DALI_ENFORCE(HasInputDox(),
-                 "Input documentation was not specified for this operator.");
-    return input_dox_[input_idx].doc;
-  }
+  /**
+   * @brief Get the docs for given input, must be specified with InputDoc
+   */
+  DLL_PUBLIC InOutDoc GetPerInputDoc(int input_idx);
 
+  /**
+   * @brief Get the docs for given output, must be specified with OutputDoc
+   */
+  DLL_PUBLIC InOutDoc GetPerOutputDoc(int output_idx);
 
   DLL_PUBLIC inline int MaxNumInput() const {
     return max_num_input_;
@@ -746,8 +776,14 @@ class DLL_PUBLIC OpSchema {
 
   inline void CheckInputIndex(int index) const {
     DALI_ENFORCE(index >= 0 && index < max_num_input_,
-      "Output index (=" + std::to_string(index) +  ") out of range [0.." +
-      std::to_string(max_num_input_) + ").\nWas NumInput called?");
+                 make_string("Input index `", index, "` out of range [0..", max_num_input_,
+                             "). Was NumInput called?"));
+  }
+
+  inline void CheckOutputIndex(int index) const {
+    DALI_ENFORCE(index >= 0 && index < num_output_,
+                 make_string("Output index `", index, "` out of range [0..", num_output_,
+                             "). Was NumOutput called?"));
   }
 
   /**
@@ -770,19 +806,20 @@ class DLL_PUBLIC OpSchema {
 
   bool disable_auto_input_dox_ = false;
 
-  struct InputDoc {
-    std::string name = {};
-    std::string type_doc = {};
-    std::string doc = {};
-  };
-  std::vector<InputDoc> input_dox_ = {};
+  std::vector<InOutDoc> input_dox_ = {};
+  std::vector<InOutDoc> output_dox_ = {};
   bool input_dox_set_ = false;
+  bool output_dox_set_ = false;
 
   // Custom docstring, if not empty should be used in place of input_dox_ descriptions
-  std::string call_dox_ = {};
+  std::string call_dox_str_ = {};
+
+  // Custom docstrings for `Args` and `Returns` sections
+  std::string input_dox_str_ = {};
+  std::string output_dox_str_ = {};
 
   // Whether to append kwargs section to __call__ docstring. On by default,
-  // can be turned off for call_dox_ specified manually
+  // can be turned off for call_dox_str_ specified manually
   bool append_kwargs_section_ = true;
 
   SpecFunc output_fn_, in_place_fn_, additional_outputs_fn_;
@@ -857,3 +894,4 @@ inline T OpSchema::GetDefaultValueForArgument(const std::string &s) const {
 }  // namespace dali
 
 #endif  // DALI_PIPELINE_OPERATOR_OP_SCHEMA_H_
+
diff --git a/dali/python/backend_impl.cc b/dali/python/backend_impl.cc
index 264bd134a8a..01cb05cbfb9 100644
--- a/dali/python/backend_impl.cc
+++ b/dali/python/backend_impl.cc
@@ -1140,18 +1140,29 @@ PYBIND11_MODULE(backend_impl, m) {
   m.def("GetSchema", &GetSchema, py::return_value_policy::reference);
 
   py::class_<OpSchema>(m, "OpSchema")
-    .def("Dox", &OpSchema::Dox)
-    .def("CanUseAutoInputDox", &OpSchema::CanUseAutoInputDox)
+    .def("DocStr", (std::string (OpSchema::*)() const)&OpSchema::DocStr)
+    .def("CanUseAutoInputDoc", &OpSchema::CanUseAutoInputDoc)
     .def("AppendKwargsSection", &OpSchema::AppendKwargsSection)
-    .def("HasCallDox", &OpSchema::HasCallDox)
-    .def("GetCallDox", &OpSchema::GetCallDox)
-    .def("HasInputDox", &OpSchema::HasInputDox)
+    .def("HasCallDocStr", &OpSchema::HasCallDocStr)
+    .def("GetCallDocStr", &OpSchema::GetCallDocStr)
+    .def("HasInputDocStr", &OpSchema::HasInputDocStr)
+    .def("GetInputDocStr", &OpSchema::GetInputDocStr)
+    .def("HasOutputDocStr", &OpSchema::HasOutputDocStr)
+    .def("GetOutputDocStr", &OpSchema::GetOutputDocStr)
+    .def("HasPerInputDoc", &OpSchema::HasPerInputDoc)
     .def("GetCallSignatureInputs", &OpSchema::GetCallSignatureInputs)
-    .def("GetInputName", &OpSchema::GetInputName)
-    .def("GetInputType", &OpSchema::GetInputType)
-    .def("GetInputDox", &OpSchema::GetInputDox)
+    .def("GetPerInputDoc", [](OpSchema &schema, int input_idx) {
+        const auto &info = schema.GetPerInputDoc(input_idx);
+        return std::make_tuple(info.name, info.type_doc, info.doc);
+     })
+    .def("HasPerOutputDoc", &OpSchema::HasPerOutputDoc)
+    .def("GetPerOutputDoc", [](OpSchema &schema, int output_idx) {
+        const auto &info = schema.GetPerOutputDoc(output_idx);
+        return std::make_tuple(info.name, info.type_doc, info.doc);
+     })
     .def("MaxNumInput", &OpSchema::MaxNumInput)
     .def("MinNumInput", &OpSchema::MinNumInput)
+    .def("NumOutput", (int (OpSchema::*)() const)&OpSchema::NumOutput)
     .def("HasOutputFn", &OpSchema::HasOutputFn)
     .def("CalculateOutputs", &OpSchema::CalculateOutputs)
     .def("CalculateAdditionalOutputs", &OpSchema::CalculateAdditionalOutputs)
diff --git a/dali/python/nvidia/dali/ops.py b/dali/python/nvidia/dali/ops.py
index 27db9f72bb7..47fa7f94301 100644
--- a/dali/python/nvidia/dali/ops.py
+++ b/dali/python/nvidia/dali/ops.py
@@ -42,11 +42,29 @@ def _setup_cupy():
 _gpu_ops = set({})
 _mixed_ops = set({})
 
+_args_header = """
+Args
+----
+"""
+
+_kwargs_header = """
+Keyword args
+------------
+"""
+
+_returns_header = """
+Returns
+-------
+"""
+
 def _numpydoc_formatter(name, type, doc, optional = False):
     indent = "\n" + " " * 4
     if optional:
         type += ", optional"
-    return "`{}` : {}{}{}".format(name, type, indent, doc.replace("\n", indent))
+    if name:
+        return "{} : {}{}{}".format(name, type, indent, doc.replace("\n", indent))
+    else:
+        return "{}{}{}".format(type, indent, doc.replace("\n", indent))
 
 def _get_kwargs(schema, only_tensor = False):
     """
@@ -94,7 +112,7 @@ def _docstring_generator(cls):
             ret +=". Use `" + use_instead + "` instead."
         ret += "\n\n"
 
-    ret += schema.Dox()
+    ret += schema.DocStr()
     ret += '\n'
 
     if schema.IsSequenceOperator():
@@ -122,10 +140,7 @@ def _docstring_generator(cls):
         ret += " * " + dev + "\n"
     ret += "\n"
 
-    ret += """
-Keyword args
-------------
-"""
+    ret += _kwargs_header
     ret += _get_kwargs(schema)
     return ret
 
@@ -134,6 +149,40 @@ def _supported_layouts_str(supported_layouts):
         return ""
     return " (" + ", ".join(["\'" + str(layout) + "\'" for layout in supported_layouts]) + ")"
 
+def _docstring_get_args(op_name):
+    """ Generate Args section for __call__: per-input docs, else InputDocStr, else empty """
+    schema = _b.GetSchema(op_name)
+    if not schema.HasPerInputDoc():  # GetPerInputDoc would raise; fall back to InputDocStr
+        return (_args_header + schema.GetInputDocStr()) if schema.HasInputDocStr() else ""
+    ret = _args_header
+    for i in range(schema.MaxNumInput()):
+        name, type_doc, doc = schema.GetPerInputDoc(i)
+        input_type_str = type_doc + _supported_layouts_str(schema.GetSupportedLayouts(i))
+        ret += _numpydoc_formatter(name, input_type_str, doc, i >= schema.MinNumInput()) + "\n"
+    return ret
+
+def _docstring_get_kwargs(op_name):
+    """ Generate Kwargs section for __call__; empty when _get_kwargs yields no tensor kwargs """
+    tensor_kwargs = _get_kwargs(_b.GetSchema(op_name), only_tensor = True)
+    return (_kwargs_header + tensor_kwargs) if tensor_kwargs else ""
+
+def _docstring_get_returns(op_name):
+    """ Generate Returns section for __call__ """
+    schema = _b.GetSchema(op_name)
+    if schema.HasPerOutputDoc():
+        ret = _returns_header
+        for i in range(schema.NumOutput()):
+            name, type_doc, doc = schema.GetPerOutputDoc(i)
+            output_type_str = type_doc
+            ret += _numpydoc_formatter(name, output_type_str, doc)
+            ret += "\n"
+        return ret
+    elif schema.HasOutputDocStr():
+        return _returns_header + schema.GetOutputDocStr()
+    else:
+        return ""
+
+
 def _docstring_prefix_from_inputs(op_name):
     """
         Generate start of the docstring for `__call__` of Operator `op_name`
@@ -146,17 +195,6 @@ def _docstring_prefix_from_inputs(op_name):
     ret = "__call__(" + schema.GetCallSignatureInputs() + ", **kwargs)\n"
     # __call__ docstring
     ret += "\nOperator call to be used in `define_graph` step.\n"
-    # Args section
-    ret += """
-Args
-----
-"""
-    for i in range(schema.MaxNumInput()):
-        optional = i >= schema.MinNumInput()
-        input_type_str = schema.GetInputType(i) + _supported_layouts_str(schema.GetSupportedLayouts(i))
-        ret += _numpydoc_formatter(schema.GetInputName(i), input_type_str, schema.GetInputDox(i), optional)
-        ret += "\n"
-    ret += "\n"
     return ret
 
 def _docstring_prefix_auto(op_name):
@@ -175,9 +213,8 @@ def _docstring_prefix_auto(op_name):
 
 Operator call to be used in `define_graph` step.
 
-Args
-----
 """
+        ret += _args_header
         dox = "Input to the operator.\n"
         fmt  = "TensorList" + _supported_layouts_str(schema.GetSupportedLayouts(0))
         ret += _numpydoc_formatter("data", fmt, dox, optional=False)
@@ -190,23 +227,19 @@ def _docstring_generator_call(op_name):
         Generate full docstring for `__call__` of Operator `op_name`.
     """
     schema = _b.GetSchema(op_name)
-    if schema.HasCallDox():
-        ret = schema.GetCallDox()
-    elif schema.HasInputDox():
+    if schema.HasCallDocStr():
+        ret = schema.GetCallDocStr()
+        ret += _docstring_get_args(op_name)
+    elif schema.HasPerInputDoc():
         ret =_docstring_prefix_from_inputs(op_name)
-    elif schema.CanUseAutoInputDox():
+        ret += _docstring_get_args(op_name)
+    elif schema.CanUseAutoInputDoc():
         ret = _docstring_prefix_auto(op_name)
     else:
         ret = "Please refer to class :meth:`nvidia.dali.ops." + op_name + "` for full documentation.\n"
     if schema.AppendKwargsSection():
-        # Kwargs section
-        tensor_kwargs = _get_kwargs(schema, only_tensor = True)
-        if tensor_kwargs:
-            ret += """
-Keyword Args
-------------
-"""
-            ret += tensor_kwargs
+        ret += _docstring_get_kwargs(op_name)
+    ret += _docstring_get_returns(op_name)
     return ret
 
 class _OpCounter(object):
diff --git a/docs/api.rst b/docs/api.rst
index 3c7411f569f..d982f53f22c 100644
--- a/docs/api.rst
+++ b/docs/api.rst
@@ -6,6 +6,7 @@ deep learning frameworks. This part of the documentation contains the detailed d
 API.
 
 .. currentmodule:: nvidia.dali.pipeline
+
 `Pipeline <pipeline.rst>`_ section describes the :class:`Pipeline` class - the central and most
 important part of every program using DALI.
 
diff --git a/docs/data_types.rst b/docs/data_types.rst
index c357000d1ab..35abdad7c00 100644
--- a/docs/data_types.rst
+++ b/docs/data_types.rst
@@ -2,8 +2,10 @@ Types
 =====
 
 .. _TensorList:
+
 TensorList
 ----------
+
 .. currentmodule:: nvidia.dali.pipeline
 
 TensorList represents a batch of tensors. TensorLists are the return values of `Pipeline.run`

From 92e706b5a6ccec49bb82f35ccc0dafee289c87b8 Mon Sep 17 00:00:00 2001
From: Krzysztof Lecki <klecki@nvidia.com>
Date: Fri, 3 Apr 2020 15:39:54 +0200
Subject: [PATCH 2/3] Docstring guide

Signed-off-by: Krzysztof Lecki <klecki@nvidia.com>
---
 STYLE_GUIDE.md | 45 +++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 43 insertions(+), 2 deletions(-)

diff --git a/STYLE_GUIDE.md b/STYLE_GUIDE.md
index 685fdc31a23..4bc41b37e93 100644
--- a/STYLE_GUIDE.md
+++ b/STYLE_GUIDE.md
@@ -27,8 +27,8 @@ and introduces ugly code like `foo(&bar)` or `(*buf)[i]`.
 
 ### Test suites naming guide
 
-We use GTest for most of testing code in DALI. Names of TestSuites should start with a capital letter and end with `Test`. 
-Additionally, both suite and case name mustn't contain underscores (`_`). 
+We use GTest for most of testing code in DALI. Names of TestSuites should start with a capital letter and end with `Test`.
+Additionally, both suite and case name mustn't contain underscores (`_`).
 For details on the latter, cf. [GTest FAQ](https://github.com/google/googletest/blob/master/googletest/docs/faq.md#why-should-test-suite-names-and-test-names-not-contain-underscore).
 Examples:
 ```
@@ -55,6 +55,47 @@ The order of the arguments is following memcpy semantics.
 
 ### Documentation
 
+#### Operator documentation
+
+DALI Operators use docstrings written in [rst](https://docutils.sourceforge.io/docs/user/rst/quickref.html)
+with [numpydoc syntax](https://numpydoc.readthedocs.io/en/latest/format.html).
+
+The inputs, outputs and argument inputs that represent passing batches of tensors should
+be documented in the following format:
+
+```
+<name> : sample shape: <shape info string>, <type>
+    <description>
+```
+
+for example:
+
+```
+images : sample shape: [H, W, C], uint8
+    decoded images.
+```
+
+In case when one of the `InputDox`, `AddArg` or `AddOptionalArg` is used to provide the documentation,
+the formatting above is automatically generated.
+
+When provided as a string through `CallDocStr`, the formatting should be maintained by hand.
+
+The `<shape info string>` should describe the shape of sample in that batch,
+using square brackets `[]`, for example `[H, W, C]`, `[time, frequency]`, `[X, Y] or [X, Y, Z]`
+and should not contain explicit batch size.
+
+The `<type>` should be a singular-form description of the possible types, for example `int`, `uint8`,
+`float`, `uint8 or float`.
+
+In Operator docstrings, double backticks are preferred over single backticks where suitable:
+
+```
+``var_name`` - preferred
+`var_name`
+```
+
+#### C++ documentation
+
 DALI uses Doxygen for C++ code documentation with Javadoc-styled comments:
 
 ```

From 956fcf849e2c0c7de457ab6b3dc20b43c279427f Mon Sep 17 00:00:00 2001
From: Krzysztof Lecki <klecki@nvidia.com>
Date: Fri, 3 Apr 2020 18:14:42 +0200
Subject: [PATCH 3/3] Add guide, some auto generation of proposed format

TODO: Incorporate layouts, maybe unify them?

Adjust everything

Signed-off-by: Krzysztof Lecki <klecki@nvidia.com>
---
 STYLE_GUIDE.md                      |  7 +++--
 dali/pipeline/operator/op_schema.cc | 12 ++++++++
 dali/pipeline/operator/op_schema.h  | 44 +++++++++++++++++++++++------
 dali/python/backend_impl.cc         |  1 +
 dali/python/nvidia/dali/ops.py      |  3 +-
 dali/python/nvidia/dali/types.py    | 15 ++++++----
 6 files changed, 64 insertions(+), 18 deletions(-)

diff --git a/STYLE_GUIDE.md b/STYLE_GUIDE.md
index 4bc41b37e93..34e3e8ed26e 100644
--- a/STYLE_GUIDE.md
+++ b/STYLE_GUIDE.md
@@ -75,10 +75,11 @@ images : sample shape: [H, W, C], uint8
     decoded images.
 ```
 
-In case when one of the `InputDox`, `AddArg` or `AddOptionalArg` is used to provide the documentation,
-the formatting above is automatically generated.
+In case when one of the `InputDoc`, `OutputDoc`, `AddArg` or `AddOptionalArg` is used to provide
+the documentation, the formatting above is automatically generated.
 
-When provided as a string through `CallDocStr`, the formatting should be maintained by hand.
+When provided as a string through `CallDocStr`, `InputDocStr` or `OutputDocStr` the formatting
+should be manually maintained.
 
 The `<shape info string>` should describe the shape of sample in that batch,
 using square brackets `[]`, for example `[H, W, C]`, `[time, frequency]`, `[X, Y] or [X, Y, Z]`
diff --git a/dali/pipeline/operator/op_schema.cc b/dali/pipeline/operator/op_schema.cc
index bd2ceb15c97..2d119443fab 100644
--- a/dali/pipeline/operator/op_schema.cc
+++ b/dali/pipeline/operator/op_schema.cc
@@ -234,6 +234,18 @@ DALIDataType OpSchema::GetArgumentType(const std::string &name) const {
   }
 }
 
+std::string OpSchema::GetArgumentShape(const std::string &name) const {
+  DALI_ENFORCE(HasArgument(name), "Argument \"" + name +
+      "\" is not supported by operator \"" + this->name() + "\".");
+  if (HasRequiredArgument(name)) {
+    return GetRequiredArguments().at(name).shape;
+  } else {
+    // optional argument
+    return GetOptionalArguments().at(name).shape;
+  }
+}
+
+
 bool OpSchema::HasArgumentDefaultValue(const std::string &name) const {
   DALI_ENFORCE(HasArgument(name, true), "Argument \"" + name +
       "\" is not supported by operator \"" + this->name() + "\".");
diff --git a/dali/pipeline/operator/op_schema.h b/dali/pipeline/operator/op_schema.h
index c5d15bd9130..02e606d0677 100644
--- a/dali/pipeline/operator/op_schema.h
+++ b/dali/pipeline/operator/op_schema.h
@@ -39,12 +39,14 @@ class OpSpec;
 struct RequiredArgumentDef {
   std::string doc;
   DALIDataType dtype;
+  std::string shape;
 };
 
 struct DefaultedArgumentDef {
   std::string doc;
   DALIDataType dtype;
   Value *default_value;
+  std::string shape;
 };
 
 class DLL_PUBLIC OpSchema {
@@ -152,11 +154,11 @@ class DLL_PUBLIC OpSchema {
    * The arguments should be described using numpydoc syntax,
    * with comments indented by 4 spaces, for example:
    * """
-   * input0: Type of input
+   * input0: sample shape: [X, Y], element_type
    *     This is the first input
-   * input1: TensorList of some kind
+   * input1: sample shape: any shape, element_type
    *     This is second input
-   * optional_input: TensorList, optional
+   * optional_input: sample shape: [X], element_type, optional
    *     This is optional input
    * """
    *
@@ -317,12 +319,27 @@ class DLL_PUBLIC OpSchema {
   /**
    * @brief Adds a required argument to op with its type
    */
+
+  /**
+   * @brief  Adds a required argument to op with its type
+   *
+   * @param s Argument name
+   * @param doc docstring for given argument
+   * @param dtype type of the argument (used for docstring generation)
+   * @param enable_tensor_input If given argument accepts tensors, which means it is also
+   *                            an Argument Input.
+   * @param shape Optional shape description; may be non-empty only for an Argument Input.
+   */
   DLL_PUBLIC inline OpSchema& AddArg(const std::string &s,
                                      const std::string &doc,
                                      const DALIDataType dtype,
-                                     bool enable_tensor_input = false) {
+                                     bool enable_tensor_input = false,
+                                     const std::string &shape = "") {
+    DALI_ENFORCE(enable_tensor_input || shape.empty(),
+                 make_string("Shape cannot be provided for required argument ", s,
+                             " which doesn't accept tensor inputs."));
     CheckArgument(s);
-    arguments_[s] = {doc, dtype};
+    arguments_[s] = {doc, dtype, shape};
     if (enable_tensor_input) {
       tensor_arguments_.insert(s);
     }
@@ -431,9 +448,10 @@ class DLL_PUBLIC OpSchema {
   DLL_PUBLIC inline OpSchema& AddOptionalArg(const std::string &s,
                                      const std::string &doc,
                                      std::nullptr_t,
-                                     bool enable_tensor_input = false) {
+                                     bool enable_tensor_input = false,
+                                     const std::string &shape = "") {
     CheckArgument(s);
-    optional_arguments_[s] = {doc, type2id<T>::value, nullptr};
+    optional_arguments_[s] = {doc, type2id<T>::value, nullptr, shape};
     if (enable_tensor_input) {
       tensor_arguments_.insert(s);
     }
@@ -449,10 +467,11 @@ class DLL_PUBLIC OpSchema {
   AddOptionalArg(const std::string &s,
                  const std::string &doc,
                  T default_value,
-                 bool enable_tensor_input = false) {
+                 bool enable_tensor_input = false,
+                 const std::string &shape = "") {
     CheckArgument(s);
     auto to_store = Value::construct(default_value);
-    optional_arguments_[s] = {doc, type2id<T>::value, to_store.get()};
+    optional_arguments_[s] = {doc, type2id<T>::value, to_store.get(), shape};
     optional_arguments_unq_.push_back(std::move(to_store));
     if (enable_tensor_input) {
       tensor_arguments_.insert(s);
@@ -747,6 +766,13 @@ class DLL_PUBLIC OpSchema {
    */
   DLL_PUBLIC DALIDataType GetArgumentType(const std::string &name) const;
 
+  /**
+   * @brief Get string representing a shape of argument input of given name.
+   *
+   * If the input is not accepting tensor inputs (argument input) the returned string is empty.
+   */
+  DLL_PUBLIC std::string GetArgumentShape(const std::string &name) const;
+
   /**
    * @brief Check if the argument has a default value.
    *        Required arguments always return false.
diff --git a/dali/python/backend_impl.cc b/dali/python/backend_impl.cc
index 01cb05cbfb9..9e53606f9cb 100644
--- a/dali/python/backend_impl.cc
+++ b/dali/python/backend_impl.cc
@@ -1170,6 +1170,7 @@ PYBIND11_MODULE(backend_impl, m) {
     .def("CheckArgs", &OpSchema::CheckArgs)
     .def("GetArgumentDox", &OpSchema::GetArgumentDox)
     .def("GetArgumentType", &OpSchema::GetArgumentType)
+    .def("GetArgumentShape", &OpSchema::GetArgumentShape)
     .def("HasArgumentDefaultValue", &OpSchema::HasArgumentDefaultValue)
     .def("GetArgumentDefaultValueString", &OpSchema::GetArgumentDefaultValueString)
     .def("GetArgumentNames", &OpSchema::GetArgumentNames)
diff --git a/dali/python/nvidia/dali/ops.py b/dali/python/nvidia/dali/ops.py
index 47fa7f94301..ab866ac9695 100644
--- a/dali/python/nvidia/dali/ops.py
+++ b/dali/python/nvidia/dali/ops.py
@@ -83,7 +83,8 @@ def _get_kwargs(schema, only_tensor = False):
         if not only_tensor or schema.IsTensorArgument(arg):
             arg_name_doc = arg
             dtype = schema.GetArgumentType(arg)
-            type_name = _type_name_convert_to_string(dtype, is_tensor = only_tensor)
+            shape = schema.GetArgumentShape(arg) if only_tensor else None
+            type_name = _type_name_convert_to_string(dtype, is_tensor = only_tensor, sample_shape = shape)
             if schema.IsArgumentOptional(arg):
                 type_name += ", optional"
                 if schema.HasArgumentDefaultValue(arg):
diff --git a/dali/python/nvidia/dali/types.py b/dali/python/nvidia/dali/types.py
index 319a3211ce1..b8888fbaa21 100644
--- a/dali/python/nvidia/dali/types.py
+++ b/dali/python/nvidia/dali/types.py
@@ -71,13 +71,18 @@ def _not_implemented(val):
     _known_types[DALIDataType._FEATURE_DICT] = ("dict of (string, nvidia.dali.tfrecord.Feature)",
             _not_implemented)
 
-def _type_name_convert_to_string(dtype, is_tensor):
+def _type_name_convert_to_string(dtype, is_tensor, sample_shape = None):
+    if not is_tensor and sample_shape is not None:
+        raise RuntimeError("Cannot place shape information in scalar type signature")
     if dtype in _known_types:
-        ret = _known_types[dtype][0]
-        if is_tensor:
-            ret = "TensorList of " + ret
+        dtype_str = _known_types[dtype][0]
+        ret = dtype_str
+        if is_tensor and sample_shape:
+            ret = "sample shape: {}, {}".format(sample_shape, dtype_str)
+        elif is_tensor:
+            ret = "TensorList, {}".format(dtype_str)
         elif dtype in _vector_types:
-            ret = ret + " or list of " + _known_types[dtype][0]
+            ret = dtype_str + " or list of " + dtype_str
         return ret
     else:
         raise RuntimeError(str(dtype) + " does not correspond to a known type.")