diff --git a/.clang-tidy-filter b/.clang-tidy-filter
new file mode 100644
index 00000000000..efe75e1118d
--- /dev/null
+++ b/.clang-tidy-filter
@@ -0,0 +1,4 @@
+LIST OF HEADER FILES TO BE IGNORED BY CLANG TIDY
+include/dali/util/half.hpp
+include/dali/core/tensor_shape.h
+include/dali/core/access_order.h
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 170e5cc14da..b034d958965 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -274,9 +274,52 @@ endif()
 
 if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-deprecated-register -Wsign-compare")
+  execute_process(COMMAND
+      find ${CMAKE_SOURCE_DIR}/dali ${CMAKE_SOURCE_DIR}/include
+      -type f
+      -regex ".*\\.\\(h\\|hpp\\|cuh\\|inl\\)$"
+      OUTPUT_VARIABLE ALL_HEADERS
+      OUTPUT_STRIP_TRAILING_WHITESPACE)
+      #ECHO_OUTPUT_VARIABLE
+      #COMMAND_ECHO STDOUT)
+  # string(REGEX REPLACE "\n$" "|" ALL_HEADERS "${ALL_HEADERS}")
+  # string(REPLACE "\n" ";" ALL_HEADERS_LIST ${ALL_HEADERS})
+  # Remove the absolute path so we can apply filter file easily.
+  string(REPLACE "${CMAKE_SOURCE_DIR}/" "" ALL_HEADERS ${ALL_HEADERS})
+  # string(REGEX REPLACE "\\." "\." ALL_HEADERS "${ALL_HEADERS}")
+  message(STATUS "ALL_HEADERS is set to ${ALL_HEADERS}")
+  set(TEST_LIST "include/dali/core/tensor_shape\\.h|include/dali/core/util\\.h")
+
+  file(READ ".clang-tidy-filter" CLANG_TIDY_LINE_FILTER)
+  # make this a list
+  string(REPLACE "\n" ";" CLANG_TIDY_LINE_FILTER ${CLANG_TIDY_LINE_FILTER})
+  foreach(FILTERED IN ITEMS ${CLANG_TIDY_LINE_FILTER})
+    string(REPLACE ${FILTERED} "" ALL_HEADERS ${ALL_HEADERS})
+  endforeach()
+  # nice and consistent argument order, we need to strip
+  string(STRIP ${ALL_HEADERS} ALL_HEADERS)
+
+  # Format into the regex - make every line an alternative, swap . into \. and
+  # We may have some empty newlines after the filtering
+  string(REGEX REPLACE "(\n)+" "|" ALL_HEADERS ${ALL_HEADERS})
+  string(REPLACE "." "\\." ALL_HEADERS ${ALL_HEADERS})
+  message(STATUS "ALL_HEADERS is set to ${ALL_HEADERS}")
+
 
   # Settings specific to builds where Clang is used for compiling CUDA code
   if (DALI_CLANG_ONLY)
+    # Full power of clang-tidy
+    # Remove some that appera in fp16 and tests, and try again
+    set(CMAKE_CXX_CLANG_TIDY "clang-tidy"
+    "-header-filter=${ALL_HEADERS}")
+      # "--extra-arg-before=--system-header-prefix=include/dali/util/half.hpp")
+      # "-warnings-as-errors=-*,bugprone-*,misc-*,modernize-*,clang-analyzer-*"
+      # "-checks=-clang-analyzer-security.FloatLoopCounter,-clang-analyzer-core.DivideZero,-clang-analyzer-core.UndefinedBinaryOperatorResult"
+      # "-header-filter=.*")
+      # Line filter filters too much
+      # "-line-filter=${CLANG_TIDY_LINE_FILTER}")
+    message(STATUS "CMAKE_CXX_CLANG_TIDY is set to ${CMAKE_CXX_CLANG_TIDY}")
+
     # Enable SFINAE with ellipsis in device code
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Xclang -fcuda-allow-variadic-functions")
 
diff --git a/dali/core/convert_test.cc b/dali/core/convert_test.cc
index dd187a80279..2c96055e0df 100644
--- a/dali/core/convert_test.cc
+++ b/dali/core/convert_test.cc
@@ -1,4 +1,4 @@
-// Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+// Copyright (c) 2019-2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -22,7 +22,7 @@ namespace dali {
 
 TEST(ConvertSat, float2int) {
   for (int exp = -10; exp < 100; exp++) {
-    for (float sig = -256; sig <= 256; sig++) {
+    for (int sig = -256; sig <= 256; sig++) {
       float f = ldexpf(sig, exp);
       float integral;
       float fract = modff(f, &integral);
diff --git a/dali/core/mm/cuda_vm_resource_test.cc b/dali/core/mm/cuda_vm_resource_test.cc
index 4ce1fbecad2..a8f84f44701 100644
--- a/dali/core/mm/cuda_vm_resource_test.cc
+++ b/dali/core/mm/cuda_vm_resource_test.cc
@@ -218,10 +218,10 @@ class VMResourceTest : public ::testing::Test {
     cuda_vm_resource res(-1, block_size, va_size);
     size_t size1 = 4*block_size + (4<<20);  // ends past 4 blocks
     size_t size2 = 2*block_size - (8<<20);  // ends before 6 blocks
-    size_t size3 = size1;
     size_t size4 = (4<<20);  // after allocating 1, 2 and 4, the allocation should end at 6th block
     void *p1 = res.allocate(size1);
     void *p2 = res.allocate(size2);
+    (void)p2;
     res.deallocate(p1, size1);
     auto &region = res.va_regions_[0];
     EXPECT_EQ(region.available_blocks, 4);
@@ -244,6 +244,7 @@ class VMResourceTest : public ::testing::Test {
     EXPECT_EQ(region.available.find(true), region.available.ssize());
     EXPECT_EQ(region.available.find(false), 0);
     void *p4 = res.allocate(size4);
+    (void)p4;
     EXPECT_EQ(region.available_blocks, 0);
     EXPECT_EQ(region.mapped.find(true), 0);
     EXPECT_EQ(region.mapped.find(false), 6);
diff --git a/dali/kernels/audio/mel_scale/mel_filter_bank_cpu.cc b/dali/kernels/audio/mel_scale/mel_filter_bank_cpu.cc
index de9d6cde2b5..81aa2ee9832 100644
--- a/dali/kernels/audio/mel_scale/mel_filter_bank_cpu.cc
+++ b/dali/kernels/audio/mel_scale/mel_filter_bank_cpu.cc
@@ -1,4 +1,4 @@
-// Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+// Copyright (c) 2019-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -67,12 +67,16 @@ class MelFilterBankCpu<T>::Impl: public MelFilterImplBase<T> {
       double freq = mel_scale.mel_to_hz(mel);
       interval_ends_[interval] = std::ceil(freq / hz_step_);
     }
+    std::string str = "Hello, world!\n";
+    std::vector<std::string> messages;
+    messages.emplace_back(std::move(str));
+    std::cout << str;
   }
 
 
   /**
-   * @brief Applies mel filter bank to a 2D spectrogram, optimized for 
-   *        frequency-major layout ("ft"). 
+   * @brief Applies mel filter bank to a 2D spectrogram, optimized for
+   *        frequency-major layout ("ft").
    */
   void ComputeFreqMajor(T* out, const T* in, int64_t nwindows,
                         int64_t out_size, int64_t out_stride,
diff --git a/dali/kernels/common/scatter_gather.cu b/dali/kernels/common/scatter_gather.cu
index 451229d7a5d..878ca608b22 100644
--- a/dali/kernels/common/scatter_gather.cu
+++ b/dali/kernels/common/scatter_gather.cu
@@ -76,6 +76,8 @@ ScatterGatherBase::BlockCountAndSize(const std::vector<CopyRange> &ranges) const
 
   size_t size_per_block = std::min(max_size, max_size_per_block_);
 
+  DALI_ENFORCE(size_per_block > 0, "Non-empty set of ranges needs to be provided");
+
   size_t num_blocks = 0;
   for (auto &r : ranges)
     num_blocks += div_ceil(r.size, size_per_block);
diff --git a/dali/kernels/signal/dct/dct_gpu.cu b/dali/kernels/signal/dct/dct_gpu.cu
index 6db1e4c3d5e..fd68aa0095a 100644
--- a/dali/kernels/signal/dct/dct_gpu.cu
+++ b/dali/kernels/signal/dct/dct_gpu.cu
@@ -193,6 +193,7 @@ DLL_PUBLIC void Dct1DGpu<OutputType, InputType>::Run(KernelContext &ctx,
 
   int i = 0;
   for (auto &table_entry : cos_tables_) {
+    // NOLINTNEXTLINE(clang-analyzer-core.uninitialized.Assign)
     auto cpu_table = cpu_cos_table[i % 2];
     auto &buffer_event = buffer_events_[i % 2];
     int n;
diff --git a/dali/kernels/signal/window/extract_windows_gpu.cuh b/dali/kernels/signal/window/extract_windows_gpu.cuh
index c0ec370401b..a84384148e5 100644
--- a/dali/kernels/signal/window/extract_windows_gpu.cuh
+++ b/dali/kernels/signal/window/extract_windows_gpu.cuh
@@ -532,6 +532,7 @@ struct ExtractHorizontalWindowsImplGPU : ExtractWindowsImplGPU<Dst, Src> {
         out_shape.set_tensor_shape(i, { nwin, out_win_length });
       }
     }
+    DALI_ENFORCE(max_win_per_input > 0, "All inputs result in no windows to be extracted.");
 
     if (concatenate) {
       out_shape.set_tensor_shape(0, { total_windows, out_win_length });
diff --git a/include/dali/core/dev_string.h b/include/dali/core/dev_string.h
index ad85a89f000..601efe7d377 100644
--- a/include/dali/core/dev_string.h
+++ b/include/dali/core/dev_string.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+// Copyright (c) 2019-2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -51,6 +51,7 @@ struct DeviceString {
 
   __device__ void clear() {
     if (length_) {
+      // NOLINTNEXTLINE
       free(data_);
       data_ = nullptr;
       length_ = 0;
diff --git a/include/dali/core/mm/cuda_vm_resource.h b/include/dali/core/mm/cuda_vm_resource.h
index 92955693c49..36ee6b18e84 100644
--- a/include/dali/core/mm/cuda_vm_resource.h
+++ b/include/dali/core/mm/cuda_vm_resource.h
@@ -593,7 +593,6 @@ class cuda_vm_resource : public memory_resource<memory_kind::device> {
     auto *region = va_find(ptr);
     assert(region);
     CUdeviceptr region_start  = region->address_range.ptr();
-    CUdeviceptr region_end    = region->address_range.end();
     ptrdiff_t offset = dptr - region_start;
     int start_block_idx = offset / block_size_;
     int end_block_idx = (offset + size + block_size_ - 1) / block_size_;
@@ -607,7 +606,6 @@ class cuda_vm_resource : public memory_resource<memory_kind::device> {
       int next_unmapped = region->mapped.find(false, block_idx);
       if (next_unmapped >= end_block_idx)
         break;  // everything we need is mapped
-      block_idx = next_unmapped;
       int next_mapped = region->mapped.find(true, next_unmapped + 1);
       int next = std::min(end_block_idx, next_mapped);
       blocks_to_map += next - next_unmapped;