From aee7d2525c8ff9239361e1a193b1002dda1558de Mon Sep 17 00:00:00 2001
From: bweeding <57697604+bweeding@users.noreply.github.com>
Date: Tue, 21 Jan 2020 14:59:23 +1100
Subject: [PATCH 01/59] Create addCO2.py

---
 ocean_dp/processing/addCO2.py | 90 +++++++++++++++++++++++++++++++++++
 1 file changed, 90 insertions(+)
 create mode 100755 ocean_dp/processing/addCO2.py

diff --git a/ocean_dp/processing/addCO2.py b/ocean_dp/processing/addCO2.py
new file mode 100755
index 0000000..6f095be
--- /dev/null
+++ b/ocean_dp/processing/addCO2.py
@@ -0,0 +1,90 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Tue Jan 21 14:58:20 2020
+
+@author: tru050
+"""
+
+
+
+
+from netCDF4 import Dataset
+import sys
+import gsw
+import numpy as np
+from datetime import datetime
+import pandas
+
+# addCO2 takes a SOTS FV02 gridded product netCDFfile as an input, and adds 
+# CO2 data (delivered from NOAA in a csv file) to the netCDFfile
+def addCO2(netCDFfile):
+    
+    # Import the SOTS netcdf
+    ds = Dataset(netCDFfile, 'a')
+    
+    # Extract the time variable, in serial date numbers (days since 01/01/1950)
+    var_time = ds.variables["TIME"]
+
+    # Convert the variable object to an array
+    netcdf_serials = np.array(var_time[:])
+    
+    # Read in the CO2 csv file, ignoring the first five rows
+    dcsv = pandas.read_csv('SOFS_prelimdata_Nov2019.csv',header=5)
+    
+    # Convert the dataframe to an array
+    dc = dcsv.to_numpy()
+    
+    csv_dates = []
+    
+    # Create a list of datetimes from the csv
+    for i in range(len(dc)):
+    
+        csv_dates.append(datetime.strptime(dc[i,0],'%m/%d/%Y %H:%M'))
+        
+    # Calculate the difference between the csv dates and 01/01/1950 in order
+    # to convert them to the serial date format of the netcdf
+    time_offset_1950 = datetime(1950,1,1,0,0,0)
+    
+    csv_delta= []
+    
+    for i in range(len(dc)):
+    
+        csv_delta.append(csv_dates[i] - time_offset_1950)
+        
+    
+    # Convert the datetimes from the csv into an array of serial date numbers
+    csv_serials = []
+    
+    for i in range(len(dc)):
+    
+        csv_serials.append(csv_delta[i].days + csv_delta[i].seconds/86400)
+        
+    csv_serials = np.array(csv_serials)
+    
+    # Find the indices of timestamps of the csv file that are in the deployment
+    # period of the netcdf file
+    matching_index = (netcdf_serials[0] <= csv_serials) &  (csv_serials <= netcdf_serials[-1])
+    
+    new_vars = ['XCO2_PRES','XCO2_OCEAN','XCO2_AIR','XCO2_PSAL','XCO2_SSTEMP']
+    
+    # For each of the variables in the csv file (except time), linearly 
+    # interpolate to the timestamps of the netcdf file    
+    for i in range(0,len(new_vars)):
+    
+        np.interp(netcdf_serials,csv_serials[matching_index],np.array(dcsv[dcsv.columns[i+1]])[matching_index].astype('float64'))
+        
+        ncVarOut = ds.createVariable(new_vars[i], "f4", ("TIME",), fill_value=np.nan, zlib=True)
+        
+        
+        
+    # update the history attribute
+    try:
+        hist = ds.history + "\n"
+    except AttributeError:
+        hist = ""
+
+    ds.setncattr('history', hist + datetime.utcnow().strftime("%Y-%m-%d") + " : added ")
+
+    ds.close()
+        
\ No newline at end of file

From fea30ed80bd728b79bfe25c470515ac63db5dc11 Mon Sep 17 00:00:00 2001
From: bweeding <57697604+bweeding@users.noreply.github.com>
Date: Tue, 21 Jan 2020 16:17:41 +1100
Subject: [PATCH 02/59] Update addCO2.py

---
 ocean_dp/processing/addCO2.py | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/ocean_dp/processing/addCO2.py b/ocean_dp/processing/addCO2.py
index 6f095be..8a14af8 100755
--- a/ocean_dp/processing/addCO2.py
+++ b/ocean_dp/processing/addCO2.py
@@ -30,7 +30,7 @@ def addCO2(netCDFfile):
     netcdf_serials = np.array(var_time[:])
     
     # Read in the CO2 csv file, ignoring the first five rows
-    dcsv = pandas.read_csv('SOFS_prelimdata_Nov2019.csv',header=5)
+    dcsv = pandas.read_csv('SOFS_prelimdata_Nov2019test.csv',header=5)
     
     # Convert the dataframe to an array
     dc = dcsv.to_numpy()
@@ -68,15 +68,19 @@ def addCO2(netCDFfile):
     
     new_vars = ['XCO2_PRES','XCO2_OCEAN','XCO2_AIR','XCO2_PSAL','XCO2_SSTEMP']
     
+    new_units = ['kPa','umol/mol','umol/mol','Presumed PSU - not specified','deg C']
+    
     # For each of the variables in the csv file (except time), linearly 
     # interpolate to the timestamps of the netcdf file    
     for i in range(0,len(new_vars)):
-    
-        np.interp(netcdf_serials,csv_serials[matching_index],np.array(dcsv[dcsv.columns[i+1]])[matching_index].astype('float64'))
         
         ncVarOut = ds.createVariable(new_vars[i], "f4", ("TIME",), fill_value=np.nan, zlib=True)
         
+        ncVarOut[:] =  np.interp(netcdf_serials,csv_serials[matching_index],np.array(dcsv[dcsv.columns[i+1]])[matching_index].astype('float64'))
+        
+        ncVarOut.units = new_units[i]
         
+        ncVarOut.comment = "imported from 'SOFS_prelimdata_Nov2019.csv'"
         
     # update the history attribute
     try:
@@ -84,7 +88,7 @@ def addCO2(netCDFfile):
     except AttributeError:
         hist = ""
 
-    ds.setncattr('history', hist + datetime.utcnow().strftime("%Y-%m-%d") + " : added ")
+    ds.setncattr('history', hist + datetime.utcnow().strftime("%Y-%m-%d") + " : added 'XCO2_PRES','XCO2_OCEAN','XCO2_AIR','XCO2_PSAL','XCO2_SSTEMP' from 'SOFS_prelimdata_Nov2019.csv'")
 
     ds.close()
         
\ No newline at end of file

From 71470a317c942331e61977bad5f2e30b93759b73 Mon Sep 17 00:00:00 2001
From: bweeding <57697604+bweeding@users.noreply.github.com>
Date: Wed, 22 Jan 2020 09:55:49 +1100
Subject: [PATCH 03/59] Update copyDataset.py

---
 ocean_dp/aggregation/copyDataset.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ocean_dp/aggregation/copyDataset.py b/ocean_dp/aggregation/copyDataset.py
index 0079d13..154757f 100644
--- a/ocean_dp/aggregation/copyDataset.py
+++ b/ocean_dp/aggregation/copyDataset.py
@@ -244,8 +244,8 @@ def aggregate(files, varNames):
 
     filen = 0
 
-    # variables we want regardless
-    varNames += ['LATITUDE', 'LONGITUDE', 'NOMINAL_DEPTH']
+    # variables we want regardless 
+    varNames = [varNames]+['LATITUDE', 'LONGITUDE', 'NOMINAL_DEPTH']
 
     # remove any duplicates
     varNamesOut = set(varNames)

From 7145c3ebd80702ece428d2d6d07afc53c166371b Mon Sep 17 00:00:00 2001
From: bweeding <57697604+bweeding@users.noreply.github.com>
Date: Wed, 22 Jan 2020 15:08:31 +1100
Subject: [PATCH 04/59] Create add_interp_press.m

---
 matlab/add_interp_press.m | 125 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 125 insertions(+)
 create mode 100644 matlab/add_interp_press.m

diff --git a/matlab/add_interp_press.m b/matlab/add_interp_press.m
new file mode 100644
index 0000000..6b139ce
--- /dev/null
+++ b/matlab/add_interp_press.m
@@ -0,0 +1,125 @@
+% SOTS Pressure interpolator
+
+% This code imports pressure data from an aggregated file (constructed by
+% P.Jansen), and creates interpolated pressure records for FV00 raw
+% instrument files - firstly by interpolating along time series of pressure
+% readings in the aggregate file to find pressures at each time in a
+% particular FV00 file, and secondly by interpolating down nominal depths at
+% each timestamp to find a pressure value for each FV00 timestamp.
+
+% Ben Weeding - ben.weeding.26@gmail.com
+
+%% Load the filenames
+
+fv00_files = dir('*FV00*.nc');
+agg_files = dir('*Aggregate*.nc');
+
+%% Load the pressure data
+
+agg_pres = ncread(agg_files.name,'PRES');
+agg_pres_info = ncinfo(agg_files.name, 'PRES');
+agg_instrument_index = ncread(agg_files.name,'instrument_index');
+agg_nominal_depth  = ncread(agg_files.name,'NOMINAL_DEPTH');
+agg_time = ncread(agg_files.name,'TIME');
+
+%% Create a scattered interpolant from the aggregate data
+
+% This was an error, as it interpolated in 2D space rather than twice in 1D
+
+
+% Subsampled every 10 points for speed of execution at this point
+%scat_interp_pres = scatteredInterpolant(agg_time(1:10:end),agg_nominal_depth(agg_instrument_index(1:10:end)+1),agg_pres(1:10:end));
+
+%% Interpolate the pressure and write the data into the FV00 file
+
+% Loop through each of the fv00 files
+for i=1:length(fv00_files)
+    
+    disp(fv00_files(i).name)
+    
+    % Extract the content from the FV00 file
+    fv00_contents = ncinfo(fv00_files(i).name);
+    
+    % Check if the FV00 file contains pressure data, run the interpolation
+    % code if not
+    
+    if (sum(contains({fv00_contents.Variables(:).Name}, 'PRES')) == 0)
+        
+        % Load the FV00 data requiring pressure        
+        %'days since 1950-01-01 00:00:00 UTC' for minilog T        
+        fv00_time = ncread(fv00_files(i).name,'TIME');        
+        fv00_depth = ncread(fv00_files(i).name,'NOMINAL_DEPTH');
+  
+        % Interpolate the agg pressure records at each nominal depth to
+        % provide pressure values at each timestamp in the current FV00
+        % file
+        
+        interp_agg_pres = nan(length(agg_nominal_depth),length(fv00_time));
+        
+        % Loop through each nominal depth in the aggregate file, and get pressure for the FV00 file's time       
+        for j = 1:length(agg_nominal_depth)
+            
+            % Select the relevant time and pressures
+            
+            time_selection = agg_time(agg_instrument_index == (j-1));            
+            pres_selection = agg_pres(agg_instrument_index == (j-1));
+                        
+            % Interpolate along each nominal depth
+
+            interp_agg_pres(j,:) = interp1(time_selection,pres_selection,fv00_time);
+        end
+        
+        % At each timestamp in the FV00 record, interpolate a pressure
+        % value based on the FV00 nominal depth, and the interpolated
+        % pressures in interp_agg_pres. Using spline interpolation here to 
+        % deal with NaN values that occur in failed pressure sensors.
+        pres_interp_dummy = nan(size(fv00_time));      
+        
+        for l = 1:length(fv00_time)               
+            
+            pres_interp_dummy(l) = interp1(agg_nominal_depth,interp_agg_pres(:,l),fv00_depth,'spline');                
+        
+        end            
+        
+        pres_interp = pres_interp_dummy;
+        
+        % Create an FV01 version of the current FV00 file
+        
+        % Create the new FV01 file name
+        
+        fv01_name = strrep(fv00_files(i).name,'FV00','FV01');        
+        fv01_name(end-10:end-3)=datestr(now,'yyyymmdd');
+        
+        % Write the FV00 data into the FV01 file        
+        ncwriteschema(fv01_name, fv00_contents);
+        
+        % copy variable data to new file
+        for v = fv00_contents.Variables
+            ncwrite(fv01_name, v.Name, ncread(fv00_files(i).name, v.Name));
+        end
+        
+        % Modify the global attributes of the file to record processing,
+        % and add to the file history
+        
+        ncwriteatt(fv01_name,'/','file_version','Level 1 - partially processed');
+        hist = ncreadatt(fv00_files(i).name, '/', 'history');
+        ncwriteatt(fv01_name,'/','history',[hist newline datestr(now,'yyyy-mm-dd') ' : Added interpolated pressure from ' agg_files.name]);
+        
+        % Add and populate a PRES variable to the FV01 file
+        nccreate(fv01_name, 'PRES', 'Dimensions',{'TIME',size(pres_interp,1)}, 'FillValue',NaN);
+        ncwrite(fv01_name, 'PRES', pres_interp);
+                
+        % copy attributes from agg file to output file
+        pres_atts = agg_pres_info.Attributes; % get all attribtes from the aggregate file
+        for k=1:length(pres_atts)
+            if (strcmp(pres_atts(k).Name, '_FillValue') == 0)
+                ncwriteatt(fv01_name, 'PRES', pres_atts(k).Name, pres_atts(k).Value);
+            end
+        end
+        
+        % Add the relevant attributes to the PRES variable, including a
+        % comment noting that the data has been linearly interpolated        
+        ncwriteatt(fv01_name, 'PRES', 'comment','pressure data has been linearly interpolated from surrounding pressure sensors');
+        
+    end    
+end
\ No newline at end of file

From 8d81ba9c925d5f6fda53d3e14bb1161cefd6ba21 Mon Sep 17 00:00:00 2001
From: bweeding <57697604+bweeding@users.noreply.github.com>
Date: Thu, 30 Jan 2020 12:28:58 +1100
Subject: [PATCH 05/59] Update add_interp_press.m

---
 matlab/add_interp_press.m | 158 ++++++++++++++++++++++++++++++++++----
 1 file changed, 142 insertions(+), 16 deletions(-)

diff --git a/matlab/add_interp_press.m b/matlab/add_interp_press.m
index 6b139ce..5e5ff5a 100644
--- a/matlab/add_interp_press.m
+++ b/matlab/add_interp_press.m
@@ -22,13 +22,16 @@
 agg_nominal_depth  = ncread(agg_files.name,'NOMINAL_DEPTH');
 agg_time = ncread(agg_files.name,'TIME');
 
-%% Create a scattered interpolant from the aggregate data
-
-% This was an error, as it interpolated in 2D space rather than twice in 1D
+% Here we prevent the use of bad data from Pulse 8
 
+if strfind(fv00_files(1).name,'Pulse-8')
+             
+    agg_pres(agg_instrument_index==2 & agg_time+datenum(1950,1,1,0,0,0) >= datenum('30-01-2012 05:00','dd-mm-yyyy HH:MM'))=NaN;
+   
+    %agg_pres(agg_instrument_index==2)=NaN;
+    
+end
 
-% Subsampled every 10 points for speed of execution at this point
-%scat_interp_pres = scatteredInterpolant(agg_time(1:10:end),agg_nominal_depth(agg_instrument_index(1:10:end)+1),agg_pres(1:10:end));
 
 %% Interpolate the pressure and write the data into the FV00 file
 
@@ -54,10 +57,16 @@
         % provide pressure values at each timestamp in the current FV00
         % file
         
-        interp_agg_pres = nan(length(agg_nominal_depth),length(fv00_time));
+        interp_agg_pres = nan(length(agg_nominal_depth)+1,length(fv00_time));
+        
+        % Include a row of zeros to set surface depth as 0 dbar
+        
+        interp_agg_pres(1,:) = zeros(size(fv00_time));
+        
+        agg_nominal_depth_with_0 = [0; agg_nominal_depth];
         
         % Loop through each nominal depth in the aggregate file, and get pressure for the FV00 file's time       
-        for j = 1:length(agg_nominal_depth)
+        for j = 1:(length(agg_nominal_depth))
             
             % Select the relevant time and pressures
             
@@ -66,20 +75,36 @@
                         
             % Interpolate along each nominal depth
 
-            interp_agg_pres(j,:) = interp1(time_selection,pres_selection,fv00_time);
-        end
+            interp_agg_pres(j+1,:) = interp1(time_selection,pres_selection,fv00_time);
+        end   
+        
+        % Sort the nominal depths and pressures
+        
+        [agg_nominal_depth_with_0,sort_idx] = sort(agg_nominal_depth_with_0);
+        
+        interp_agg_pres = interp_agg_pres(sort_idx,:);
+        
+        
+        % Linearly interpolate at each timestamp to replace NaN values
+        
+        interp_agg_pres = fillmissing(interp_agg_pres,'linear','SamplePoints',agg_nominal_depth_with_0);
+        
         
         % At each timestamp in the FV00 record, interpolate a pressure
         % value based on the FV00 nominal depth, and the interpolated
-        % pressures in interp_agg_pres. Using spline interpolation here to 
-        % deal with NaN values that occur in failed pressure sensors.
-        pres_interp_dummy = nan(size(fv00_time));      
+        % pressures in interp_agg_pres. 
+        pres_interp_dummy = nan(size(fv00_time));     
         
-        for l = 1:length(fv00_time)               
+       
+        for l = 1:length(fv00_time) 
+            
+            if sum(~isnan(interp_agg_pres(:,l))) > 1
             
-            pres_interp_dummy(l) = interp1(agg_nominal_depth,interp_agg_pres(:,l),fv00_depth,'spline');                
+                pres_interp_dummy(l) = interp1(agg_nominal_depth_with_0,interp_agg_pres(:,l),fv00_depth);                
         
-        end            
+            end
+            
+        end        
         
         pres_interp = pres_interp_dummy;
         
@@ -119,7 +144,108 @@
         
         % Add the relevant attributes to the PRES variable, including a
         % comment noting that the data has been linearly interpolated        
-        ncwriteatt(fv01_name, 'PRES', 'comment','pressure data has been linearly interpolated from surrounding pressure sensors');
+        ncwriteatt(fv01_name, 'PRES', 'comment','pressure data has been interpolated from surrounding pressure sensors');
+        
+    else
+        
+        % Load the FV00 data containing pressure        
+        %'days since 1950-01-01 00:00:00 UTC' for minilog T        
+        fv00_time = ncread(fv00_files(i).name,'TIME');        
+        fv00_depth = ncread(fv00_files(i).name,'NOMINAL_DEPTH');
+        fv00_pres = ncread(fv00_files(i).name,'PRES');
+        
+        % Remove bad data in pulse 8
+        
+        if strfind(fv00_files(i).name,'Pulse-8-2011-SBE16plusV2-01606330-34m')
+            
+            fv00_pres(4442:end) = NaN;
+            
+        end
+  
+        % Interpolate the agg pressure records at each nominal depth to
+        % provide pressure values at each timestamp in the current FV00
+        % file
+        
+        interp_agg_pres = nan(length(agg_nominal_depth)+1,length(fv00_time));
+        
+        % Include a row of zeros to set surface depth as 0 dbar
+        
+        interp_agg_pres(1,:) = zeros(size(fv00_time));
+        
+        agg_nominal_depth_with_0 = [0; agg_nominal_depth];
+        
+        % Loop through each nominal depth in the aggregate file, and get pressure for the FV00 file's time       
+        for j = 1:(length(agg_nominal_depth))
+            
+            % Select the relevant time and pressures
+            
+            time_selection = agg_time(agg_instrument_index == (j-1));            
+            pres_selection = agg_pres(agg_instrument_index == (j-1));
+                        
+            % Interpolate along each nominal depth
+
+            interp_agg_pres(j+1,:) = interp1(time_selection,pres_selection,fv00_time);
+        end   
+        
+        % Sort the nominal depths and pressures
+        
+        [agg_nominal_depth_with_0,sort_idx] = sort(agg_nominal_depth_with_0);
+        
+        interp_agg_pres = interp_agg_pres(sort_idx,:);
+        
+        
+        % Linearly interpolate at each timestamp to replace NaN values
+        
+        interp_agg_pres = fillmissing(interp_agg_pres,'linear','SamplePoints',agg_nominal_depth_with_0);
+        
+        for j = 1:length(fv00_pres)
+            
+            if isnan(fv00_pres(j))
+                
+                fv00_pres(j) = interp_agg_pres(agg_nominal_depth_with_0==fv00_depth,j);
+                
+            end
+            
+        end
+        
+        % Create an FV01 version of the current FV00 file
+        
+        % Create the new FV01 file name
+        
+        fv01_name = strrep(fv00_files(i).name,'FV00','FV01');        
+        fv01_name(end-10:end-3)=datestr(now,'yyyymmdd');
+        
+        % Write the FV00 data into the FV01 file        
+        ncwriteschema(fv01_name, fv00_contents);
+        
+        % copy variable data to new file
+        for v = fv00_contents.Variables
+            ncwrite(fv01_name, v.Name, ncread(fv00_files(i).name, v.Name));
+        end
+        
+        % Modify the global attributes of the file to record processing,
+        % and add to the file history
+        
+        ncwriteatt(fv01_name,'/','file_version','Level 1 - partially processed');
+        hist = ncreadatt(fv00_files(i).name, '/', 'history');
+        ncwriteatt(fv01_name,'/','history',[hist newline datestr(now,'yyyy-mm-dd') ' : Filled missing pressure with interpolated pressure from ' agg_files.name]);
+        
+        % Add and populate a PRES variable to the FV01 file
+        %nccreate(fv01_name, 'PRES', 'Dimensions',{'TIME',size(fv00_pres,1)}, 'FillValue',NaN);
+        ncwrite(fv01_name, 'PRES', fv00_pres);
+                
+        % copy attributes from agg file to output file
+        pres_atts = agg_pres_info.Attributes; % get all attributes from the aggregate file
+        for k=1:length(pres_atts)
+            if (strcmp(pres_atts(k).Name, '_FillValue') == 0)
+                ncwriteatt(fv01_name, 'PRES', pres_atts(k).Name, pres_atts(k).Value);
+            end
+        end
+        
+        % Add the relevant attributes to the PRES variable, including a
+        % comment noting that the data has been linearly interpolated        
+        ncwriteatt(fv01_name, 'PRES', 'comment','originally missing pressure data has been interpolated from surrounding pressure sensors');
+        
         
     end    
 end
\ No newline at end of file

From 3e0f9a960df28430a7f0670ccfbd00fc3cb0d776 Mon Sep 17 00:00:00 2001
From: bweeding <57697604+bweeding@users.noreply.github.com>
Date: Wed, 5 Feb 2020 14:45:48 +1100
Subject: [PATCH 06/59] Create pressure_interpolator.py

---
 ocean_dp/processing/pressure_interpolator.py | 115 +++++++++++++++++++
 1 file changed, 115 insertions(+)
 create mode 100755 ocean_dp/processing/pressure_interpolator.py

diff --git a/ocean_dp/processing/pressure_interpolator.py b/ocean_dp/processing/pressure_interpolator.py
new file mode 100755
index 0000000..311d03d
--- /dev/null
+++ b/ocean_dp/processing/pressure_interpolator.py
@@ -0,0 +1,115 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Tue Feb  4 11:05:16 2020
+
+@author: tru050
+"""
+
+import re
+from datetime import datetime, timedelta
+from netCDF4 import num2date, date2num
+from netCDF4 import stringtochar
+import numpy.ma as ma
+import numpy as np
+import sys
+from netCDF4 import Dataset
+import numpy
+import argparse
+import glob
+import pandas as pd
+import scipy
+
+def pressure_interpolator():
+    """Interpolate pressure for FV00 files that have no PRES variable.
+
+    Reads the first '*Aggregate*.nc' file and every '*FV00*.nc' file matched
+    in the current working directory. For each FV00 file lacking 'PRES', the
+    aggregate pressures are interpolated in time for every instrument and
+    then in nominal depth; the series is built in the local interp_fv00_pres.
+    """
+    # Load the filenames of the FV00 files in the current folder
+    fv00_files = glob.glob('*FV00*.nc')
+
+    # Open the aggregate file; the with-block closes it when we are done
+    with Dataset(glob.glob('*Aggregate*.nc')[0], mode="r") as agg:
+
+        # Loop through each of the FV00 files
+        for i in fv00_files:
+
+            # Open the current file; the with-block closes it after each pass
+            with Dataset(i, mode="r") as fv00_contents:
+
+                # Only files without a pressure variable need interpolating
+                if 'PRES' in fv00_contents.variables:
+                    continue
+
+                # Read these once per file rather than once per nominal depth
+                fv00_time = fv00_contents.variables["TIME"][:]
+                instrument_index = agg.variables["instrument_index"][:]
+
+                # Create a NaN array to fill with pressure values
+                interp_agg_pres = np.full((len(agg.variables["NOMINAL_DEPTH"])+1,len(fv00_time)),np.nan)
+
+                # Set the first row as zeros to set 0m as 0dbar
+                interp_agg_pres[0,:] = 0
+
+                # Create a new array representing the nominal depths of the agg file,
+                # including the 0m values
+                agg_nominal_depths = np.insert(np.array(agg.variables["NOMINAL_DEPTH"][:]),0,0)
+
+                # For each nominal depth, interpolate the agg data at the FV00 times
+                for j in range(1,len(agg_nominal_depths)):
+                    from_instrument = instrument_index==(j-1)
+                    time_selection = agg.variables["TIME"][from_instrument]
+                    pres_selection = agg.variables["PRES"][from_instrument]
+                    interp_agg_pres[j,:] = np.interp(fv00_time,time_selection,pres_selection)
+
+                # Sort the nominal depths and pressures according to nominal depth
+                interp_agg_pres = interp_agg_pres[np.argsort(agg_nominal_depths),:]
+                agg_nominal_depths.sort()
+
+                # If there are any NaN values, linearly interpolate profilewise
+                if np.isnan(np.sum(interp_agg_pres)):
+
+                    # Make a dataframe of the interpolated pressure to handle NaNs easily
+                    interp_agg_pres_df = pd.DataFrame(data=interp_agg_pres,index=agg_nominal_depths)
+
+                    # Spacing between consecutive sorted nominal depths
+                    depth_steps = np.diff(agg_nominal_depths)
+
+                    # Positions of the columns whose deepest element is NaN
+                    nan_cols = np.flatnonzero(np.isnan(interp_agg_pres[-1,:])).tolist()
+
+                    # Select each column containing an NaN as the deepest value
+                    for j in nan_cols:
+
+                        # Find the deepest nominal depth that isn't NaN (the 0m
+                        # row is all zeros, so every column has one)
+                        deepest_valid = interp_agg_pres_df.iloc[:,j].last_valid_index()
+
+                        # Find the row index of that nominal depth
+                        deepest_valid_idx = interp_agg_pres_df.index.tolist().index(deepest_valid)
+
+                        # Fill the continuous block of NaNs from there to the bottom
+                        for k in range(deepest_valid_idx+1,len(interp_agg_pres_df)):
+
+                            # Extend downwards, based on a nominal depth difference of 1m equating to 1dbar
+                            interp_agg_pres_df.iloc[k,j] = interp_agg_pres_df.iloc[k-1,j]+depth_steps[k-1]
+
+                    # Linearly interpolate any remaining NaNs
+                    interp_agg_pres_df = interp_agg_pres_df.interpolate(method="index")
+
+                    # Convert the DataFrame back to an array
+                    interp_agg_pres =  interp_agg_pres_df.to_numpy()
+
+                # Create a NaN array to receive the FV00 interpolated pressures
+                interp_fv00_pres = np.full(np.shape(fv00_time),np.nan)
+
+                # At each timestamp, interpolate pressure for the FV00 data
+                for j in range(len(fv00_time)):
+                    interp_fv00_pres[j] = np.interp(fv00_contents.variables["NOMINAL_DEPTH"][0],agg_nominal_depths,interp_agg_pres[:,j])
+
+
+
+

From dde748f5b90ff75b8f6874584a07e03b294fd678 Mon Sep 17 00:00:00 2001
From: bweeding <57697604+bweeding@users.noreply.github.com>
Date: Wed, 5 Feb 2020 14:45:56 +1100
Subject: [PATCH 07/59] Update add_interp_press.m

---
 matlab/add_interp_press.m | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/matlab/add_interp_press.m b/matlab/add_interp_press.m
index 5e5ff5a..5ba9817 100644
--- a/matlab/add_interp_press.m
+++ b/matlab/add_interp_press.m
@@ -134,6 +134,30 @@
         nccreate(fv01_name, 'PRES', 'Dimensions',{'TIME',size(pres_interp,1)}, 'FillValue',NaN);
         ncwrite(fv01_name, 'PRES', pres_interp);
                 
+        % Add quality control variables to the FV01 file, assigning 8 to
+        % interpolated data in line with Argo
+        for v = fv00_contents.Variables
+            
+            if ~isempty(v.Dimensions)
+            
+                nccreate(fv01_name, v.Name + "_quality_control",'Dimensions',{v.Dimensions.Name,v.Dimensions.Length},'FillValue',99);
+                
+                ncwriteatt(fv01_name,v.Name + "_quality_control",'long_name',"quality_code for"+v.Name);
+                
+                ncwriteatt(fv01_name,v.Name,'ancillary_variables',v.Name + "_quality_control");
+                
+                if contains(v.Name,'PRES')
+                    
+                    ncwrite(fv01_name, v.Name + "_quality_control",8*ones(size(fv00_time)));
+                    
+                end
+                
+            end
+            
+        end
+        
+            
+            
         % copy attributes from agg file to output file
         pres_atts = agg_pres_info.Attributes; % get all attribtes from the aggregate file
         for k=1:length(pres_atts)

From b7afd52237c021633a0992603e9057ae2f6a930c Mon Sep 17 00:00:00 2001
From: bweeding <57697604+bweeding@users.noreply.github.com>
Date: Wed, 5 Feb 2020 14:46:00 +1100
Subject: [PATCH 08/59] Update copyDataset.py

---
 ocean_dp/aggregation/copyDataset.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/ocean_dp/aggregation/copyDataset.py b/ocean_dp/aggregation/copyDataset.py
index 154757f..d23fe7d 100644
--- a/ocean_dp/aggregation/copyDataset.py
+++ b/ocean_dp/aggregation/copyDataset.py
@@ -15,7 +15,6 @@
 
 # similar more general tool project https://ncagg.readthedocs.io/en/latest/ (does not work on python3 2019-10-01)
 # has configurable way of dealing with attributes
-
 # file sets to test against
 # http://thredds.aodn.org.au/thredds/catalog/IMOS/ANMN/NRS/NRSKAI/Temperature/catalog.html
 # http://thredds.aodn.org.au/thredds/catalog/IMOS/ANMN/NRS/NRSKAI/Biogeochem_profiles/catalog.html
@@ -344,7 +343,7 @@ def aggregate(files, varNames):
                         dMin = maVariableAll.max(0)
                         ncOut.setncattr("geospatial_vertical_max", dMax)
                         ncOut.setncattr("geospatial_vertical_min", dMin)
-
+                        
     dsIn.close()  # we're done with the varList now
 
     ncOut.close()

From a9c85a9d413dccd7ddfb651efab7ce1a68656d1f Mon Sep 17 00:00:00 2001
From: bweeding <57697604+bweeding@users.noreply.github.com>
Date: Wed, 5 Feb 2020 14:46:02 +1100
Subject: [PATCH 09/59] Create flatline_test.py

---
 ocean_dp/qc/flatline_test.py | 153 +++++++++++++++++++++++++++++++++++
 1 file changed, 153 insertions(+)
 create mode 100755 ocean_dp/qc/flatline_test.py

diff --git a/ocean_dp/qc/flatline_test.py b/ocean_dp/qc/flatline_test.py
new file mode 100755
index 0000000..a4e4ec5
--- /dev/null
+++ b/ocean_dp/qc/flatline_test.py
@@ -0,0 +1,153 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Mon Feb  3 14:10:41 2020
+
+@author: tru050
+"""
+
+import re
+from datetime import datetime, timedelta
+from netCDF4 import num2date, date2num
+from netCDF4 import stringtochar
+import numpy.ma as ma
+import sys
+from netCDF4 import Dataset
+import numpy as np
+import argparse
+import glob
+import pytz
+import os
+
+#!/usr/bin/python3
+
+# add_qc_flags
+# Copyright (C) 2020 Peter Jansen
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+# add QC variables to file
+
+
+def add_qc(netCDFfile):
+
+    new_name = [] # list of new file names
+
+    # loop over all file names given
+    for fn in netCDFfile[1:]:
+        ds = Dataset(fn, 'a')
+
+        # read the variable names from the netCDF dataset
+        vars = ds.variables
+
+        # create a list of variables, don't include the 'TIME' variable
+        # TODO: detect 'TIME' variable using the standard name 'time'
+        to_add = []
+        for v in vars:
+            #print (vars[v].dimensions)
+            if v != 'TIME':
+                to_add.append(v)
+
+        # for each variable, add a new ancillary variable <VAR>_quality_control to each which has 'TIME' as a dimension
+        for v in to_add:
+            if "TIME" in vars[v].dimensions:
+                # print("time dim ", v)
+
+                ncVarOut = ds.createVariable(v+"_quality_control", "i1", vars[v].dimensions, fill_value=99, zlib=True)  # fill_value=99 otherwise defaults to max, imos-toolbox uses 99
+                ncVarOut[:] = np.zeros(vars[v].shape)
+                ncVarOut.long_name = "quality_code for " + v
+
+                vars[v].ancillary_variables = v + "_quality_control"
+
+        # update the file version attribute
+        ds.file_version = "Level 1 - Quality Controlled Data"
+
+        ds.close()
+
+        # rename the file FV00 to FV01 (imos specific)
+        fn_new = fn.replace("FV00", "FV01")
+        new_name.append(fn_new)
+
+        if fn_new != fn:
+            # copy file
+            os.copy(fn, fn_new)
+
+        print(fn_new)
+
+    return new_name
+
+
+if __name__ == "__main__":
+    add_qc(sys.argv)
+
+##############################################################################
+
+def flatline_test(*target_files,target_vars=[],window=3):
+    
+    # If files aren't specified, take all the .nc files in the current folder
+    if not target_files:
+        
+        target_files = glob.glob('*.nc')
+    
+    # Loop through each files in target_files
+    for current_file in target_files:
+        
+        
+        # Print each filename
+        print("input file %s" % current_file)
+        
+        # Extract netcdf data into nc
+        nc = Dataset(current_file, mode="r")
+        
+        # Extract time
+        ncTime = nc.get_variables_by_attributes(standard_name='time')
+    
+        # If target_vars aren't user specified, set it to all the variables of 
+        # the current_file, removing TIME
+        if target_vars == []:
+            
+            target_vars = list(nc.variables.keys())
+            
+            target_vars.remove('TIME')
+            
+        # Check if file contains quality control variables, and if not create
+        
+        if not any("_quality_control" in i for i in target_vars:
+                   
+                   # insert _quality_control variables into file? 
+                   # should this be done now, or should we assume it
+                   # will have already been done?
+                
+            
+        # For each variable, extract the data 
+        for current_var in target_vars:
+            
+            var_data = np.array(nc.variables[current_var])
+            
+            for i in 0:(len(var_data)-window+1):
+                
+                    # This is true if 'window' elements in a row are equal
+                if len(set(var_data[i:(i+window)])) == 1
+                    
+                    # set corresponding QC value to...
+                    
+    
+            
+            
+            
+            
+    
+    
+    
+    
\ No newline at end of file

From 1193875bda18d662cef5ca5bccf42f48f65032f1 Mon Sep 17 00:00:00 2001
From: bweeding <57697604+bweeding@users.noreply.github.com>
Date: Wed, 12 Feb 2020 16:03:17 +1100
Subject: [PATCH 10/59] Selects deployed data

---
 ocean_dp/processing/pressure_interpolator.py |  75 +++++++++++-
 ocean_dp/qc/add_qc_flags.py                  |  61 +++++++---
 ocean_dp/qc/select_in_water.py               | 114 +++++++++++++++++++
 3 files changed, 227 insertions(+), 23 deletions(-)
 create mode 100755 ocean_dp/qc/select_in_water.py

diff --git a/ocean_dp/processing/pressure_interpolator.py b/ocean_dp/processing/pressure_interpolator.py
index 311d03d..cb929af 100755
--- a/ocean_dp/processing/pressure_interpolator.py
+++ b/ocean_dp/processing/pressure_interpolator.py
@@ -19,6 +19,8 @@
 import glob
 import pandas as pd
 import scipy
+import os
+import shutil
 
 def pressure_interpolator:
     
@@ -44,10 +46,6 @@ def pressure_interpolator:
             # Set the first row as zeros to set 0m as 0dbar
             interp_agg_pres[0,:] = 0
             
-            # Set the last row to 5000 to set 5000m as 5000dbar (~seafloor), 
-            # only for interpolation in cases where the deepest sensor has failed
-            #interp_agg_pres[-1,:] = 5000
-            
             # Create a new array representing the nominal depths of the agg file,
             # including the 0m values
             agg_nominal_depths = np.insert(np.array(agg.variables["NOMINAL_DEPTH"][:]),0,0)
@@ -103,9 +101,76 @@ def pressure_interpolator:
             for j in range(len(fv00_contents.variables["TIME"])):
             
                 interp_fv00_pres[j] = np.interp(fv00_contents.variables["NOMINAL_DEPTH"][0],agg_nominal_depths,interp_agg_pres[:,j])
-                    
+        
+        # Use methods from add_qc_flags to make a new netcdf?
+        
+        # Deal with files that already contain pressure, but may contain NaNs            
+        else:
+             # Create a NaN array to fill with pressure values
+            interp_agg_pres = np.full((len(agg.variables["NOMINAL_DEPTH"])+1,len(fv00_contents.variables["TIME"])),np.nan)
+            
+            # Set the first row as zeros to set 0m as 0dbar
+            interp_agg_pres[0,:] = 0
+            
+            # Set the last row to 5000 to set 5000m as 5000dbar (~seafloor), 
+            # only for interpolation in cases where the deepest sensor has failed
+            #interp_agg_pres[-1,:] = 5000
+            
+            # Create a new array representing the nominal depths of the agg file,
+            # including the 0m values
+            agg_nominal_depths = np.insert(np.array(agg.variables["NOMINAL_DEPTH"][:]),0,0)
+            
+            # For each nominal depth, interpolate the agg data at the FV00 times
+            for j in range(1,len(agg_nominal_depths)):
+                
+                time_selection = agg.variables["TIME"][agg.variables["instrument_index"][:]==(j-1)]
+                
+                pres_selection = agg.variables["PRES"][agg.variables["instrument_index"][:]==(j-1)]
+                
+                interp_agg_pres[j,:] = np.interp(fv00_contents.variables["TIME"][:],time_selection,pres_selection)
+                                           
+            # Sort the nominal depths and pressures according to nominal depth
+            interp_agg_pres = interp_agg_pres[np.argsort(agg_nominal_depths),:]
+        
+            agg_nominal_depths.sort()
             
+            # If there are any NaN values, linearly interpolate profilewise
+            if np.isnan(np.sum(interp_agg_pres)):
+                
+                # Make a dataframe of the interpolated pressure to handle NaNs easily
+                interp_agg_pres_df = pd.DataFrame(data=interp_agg_pres,index=agg_nominal_depths)
+                
+                # Find all the columns where the lowest element is NaN
+                nan_cols = interp_agg_pres_df[interp_agg_pres_df[-1:].isna()].tolist()
+                
+                # Select each column containing an NaN as the deepest value
+                for j in nan_cols:
+                    
+                    # Find the shallowest nominal depth that isn't NaN
+                    shallowest_val = pd.Series.last_valid_index(interp_agg_pres_df.iloc[:,j])
+                    
+                    # Find the index of that nominal depth
+                    shallowest_idx = interp_agg_pres_df.index.tolist().index(shallowest_val)
                     
+                    # Starting at the shallowest NaN in a continous block of NaNs to the bottom
+                    for k in range(shallowest_idx+1,len(interp_agg_pres_df)):
+                        
+                        # Linearly interpolate from shallow to deep, based on a nominal depth difference of 1m equating to 1dbar
+                        interp_agg_pres_df.iloc[k,j] = interp_agg_pres_df.iloc[k-1,j]+np.diff(interp_agg_pres_df.index)[k-1]
+                        
+                # Linearly interpolate any remaining NaNs
+                interp_agg_pres_df = interp_agg_pres_df.interpolate(method="index")
+                
+                # Convert the DataFrame back to an array
+                interp_agg_pres =  interp_agg_pres_df.to_numpy()
+            
+            # Create a NaN array to receive the FV00 interpolated pressures
+            interp_fv00_pres = np.full((np.shape(fv00_contents.variables["TIME"][:])),np.nan)
+            
+            # Extract the interpolated pressures (NaNs removed) to store in netCDF4
+            interp_fv00_pres = interp_agg_pres_df[interp_agg_pres_df.index==fv00_contents.variables["NOMINAL_DEPTH"][:]]
+            
+        #
                     
                 
                 
diff --git a/ocean_dp/qc/add_qc_flags.py b/ocean_dp/qc/add_qc_flags.py
index 7d00c63..bd68234 100644
--- a/ocean_dp/qc/add_qc_flags.py
+++ b/ocean_dp/qc/add_qc_flags.py
@@ -18,11 +18,12 @@
 
 from netCDF4 import Dataset, num2date
 import sys
-
+from datetime import datetime
 import numpy as np
 from dateutil import parser
 import pytz
 import os
+import shutil
 
 # add QC variables to file
 
@@ -32,8 +33,31 @@ def add_qc(netCDFfile):
     new_name = [] # list of new file names
 
     # loop over all file names given
-    for fn in netCDFfile[1:]:
-        ds = Dataset(fn, 'a')
+    for fn in netCDFfile:
+
+        # rename the file FV00 to FV01 (imos specific)
+        fn_new = fn.replace("FV00", "FV01")
+        
+        # Change the creation date in the filename to today
+        now=datetime.utcnow()
+        
+
+        
+        fn_new = "".join((fn_new[0:-11],now.strftime("%Y%m%d"),fn_new[-3::]))
+        
+        # Add the new file name to the list of new file names
+        new_name.append(fn_new)
+
+        # If a new (different) filename has been successfully generated, make 
+        # a copy of the old file with the new name
+        if fn_new != fn:
+            # copy file
+            shutil.copy(fn, fn_new)
+              
+
+        print(fn_new)
+
+        ds = Dataset(fn_new, 'a')
 
         # read the variable names from the netCDF dataset
         vars = ds.variables
@@ -51,29 +75,30 @@ def add_qc(netCDFfile):
             if "TIME" in vars[v].dimensions:
                 # print("time dim ", v)
 
-                ncVarOut = ds.createVariable(v+"_quality_control", "i1", vars[v].dimensions, fill_value=99, zlib=True)  # fill_value=99 otherwise defaults to max, imos-toolbox uses 99
-                ncVarOut[:] = np.zeros(vars[v].shape)
-                ncVarOut.long_name = "quality_code for " + v
+                if v+"_quality_control" not in ds.variables:
+                    ncVarOut = ds.createVariable(v+"_quality_control", "i1", vars[v].dimensions, fill_value=99, zlib=True)  # fill_value=99 otherwise defaults to max, imos-toolbox uses 99
+                    ncVarOut[:] = np.zeros(vars[v].shape)
+                    ncVarOut.long_name = "quality_code for " + v
+                    ncVarOut.flag_values = np.array([0, 1, 2, 3, 4, 6, 7, 9])
+                    ncVarOut.flag_meanings = 'unknown good_data probably_good_data probably_bad_data bad_data not_deployed interpolated missing_value'
+                    
 
-                vars[v].ancillary_variables = v + "_quality_control"
+                    vars[v].ancillary_variables = v + "_quality_control"
 
-        # update the file version attribute
+        # update the global attributes
         ds.file_version = "Level 1 - Quality Controlled Data"
+        
+        ds.date_created = now.strftime("%Y-%m-%dT%H:%M:%SZ")
+        
+        ds.history += ' ' + now.strftime("%Y%m%d:") + ' converted to FV01 file, quality_control variables added.'
 
-        ds.close()
-
-        # rename the file FV00 to FV01 (imos specific)
-        fn_new = fn.replace("FV00", "FV01")
-        new_name.append(fn_new)
+        # ADD quality control attributes!!
 
-        if fn_new != fn:
-            # copy file
-            os.copy(fn, fn_new)
+        ds.close()
 
-        print(fn_new)
 
     return new_name
 
 
 if __name__ == "__main__":
-    add_qc(sys.argv)
+    add_qc(sys.argv[1:])
\ No newline at end of file
diff --git a/ocean_dp/qc/select_in_water.py b/ocean_dp/qc/select_in_water.py
new file mode 100755
index 0000000..b600695
--- /dev/null
+++ b/ocean_dp/qc/select_in_water.py
@@ -0,0 +1,114 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Wed Feb 12 09:49:26 2020
+
+@author: tru050
+"""
+from dateutil.parser import parse
+from netCDF4 import Dataset, num2date, date2num
+from datetime import datetime, timedelta
+import sys
+from datetime import datetime
+import numpy as np
+from dateutil import parser
+import pytz
+import os
+import shutil
+
+def select_in_water(netCDFfile):
+    
+    new_name = [] # list of new file names
+
+    # loop over all file names given
+    for fn in netCDFfile:
+        
+        # Change the creation date in the filename to today
+        now=datetime.utcnow()
+        
+        fn_new = fn.replace("FV00", "FV01")
+        
+        fn_new = "".join((fn_new[0:-11],now.strftime("%Y%m%d"),fn_new[-3::]))
+        
+        # Add the new file name to the list of new file names
+        new_name.append(fn_new)
+            
+        # Load the original netcdf file
+        ods = Dataset(fn,'a')
+        
+        # Extract the time dimension, and the deployment start and end        
+        time = np.array(ods.variables['TIME'][:])
+        
+        inw = parse(ods.time_deployment_start)
+
+        outw = parse(ods.time_deployment_end)
+        
+        # Convert the start and end to the number format used in TIME
+        inw_num = date2num(inw.replace(tzinfo=None),units = ods.variables['TIME'].units)
+        
+        outw_num = date2num(outw.replace(tzinfo=None),units = ods.variables['TIME'].units)
+        
+        # Create logical index of deployed times
+        
+        deployed = np.logical_and(time>=inw_num,time<=outw_num)
+        
+        # Determine the length of the new time dimension
+        
+        time_dim = len(time[deployed])
+        
+        # Create the new netcdf file
+        ds = Dataset(fn_new, "w", format="NETCDF4")
+        
+        TIME = ds.createDimension("TIME",time_dim)
+        
+        # Copy global attributes
+        
+        for att in ods.ncattrs():
+            
+            ds.setncattr(att,ods.getncattr(att))
+        
+        # Copy variables
+            
+        for v_name, varin in ods.variables.items():
+            
+            varout = ds.createVariable(v_name, varin.datatype, varin.dimensions)
+    
+            # Copy variable attributes
+            varout.setncatts({k: varin.getncattr(k) for k in varin.ncattrs()})
+    
+            # Fill variables with deployed data
+            
+            if np.array(varin[:]).size == 1:
+                
+                varout[:] = varin[:]
+                
+            else:
+                
+                varout[:] = np.array(varin[:])[deployed]
+                
+        ds.date_created = now.strftime("%Y-%m-%dT%H:%M:%SZ")
+        
+        ds.history += ' ' + now.strftime("%Y%m%d:") + 'Data subset to only contain deployed (in water) data - the full record can be found in the corresponding FV00 file.'        
+        
+        ds.close()
+        
+        ods.close()
+        
+        
+        
+        
+        
+        
+        
+        
+        
+        
+        
+        
+        
+        
+        
+            
+            
+            
+            
\ No newline at end of file

From 774f7dabd15ba1cbdff4e26fe72bb4b3a02fc18a Mon Sep 17 00:00:00 2001
From: Peter Jansen <petejan@users.noreply.github.com>
Date: Wed, 12 Feb 2020 16:30:46 +1100
Subject: [PATCH 11/59] Update select_in_water.py

Minor clean-ups; should look at using split() for file name manipulation.
---
 ocean_dp/qc/select_in_water.py | 85 ++++++++++++++++------------------
 1 file changed, 39 insertions(+), 46 deletions(-)

diff --git a/ocean_dp/qc/select_in_water.py b/ocean_dp/qc/select_in_water.py
index b600695..f23df57 100755
--- a/ocean_dp/qc/select_in_water.py
+++ b/ocean_dp/qc/select_in_water.py
@@ -1,10 +1,21 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-"""
-Created on Wed Feb 12 09:49:26 2020
+#!/usr/bin/python3
+
+# ocean_dp
+# Copyright (C) 2020 Ben Weeding
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
 
-@author: tru050
-"""
 from dateutil.parser import parse
 from netCDF4 import Dataset, num2date, date2num
 from datetime import datetime, timedelta
@@ -16,19 +27,19 @@
 import os
 import shutil
 
-def select_in_water(netCDFfile):
+
+def select_in_water(netCDFfiles):
     
     new_name = [] # list of new file names
 
     # loop over all file names given
-    for fn in netCDFfile:
+    for fn in netCDFfiles:
         
         # Change the creation date in the filename to today
         now=datetime.utcnow()
         
-        fn_new = fn.replace("FV00", "FV01")
-        
-        fn_new = "".join((fn_new[0:-11],now.strftime("%Y%m%d"),fn_new[-3::]))
+        fn_new = fn.replace("FV00", "FV01")        
+        fn_new = "".join((fn_new[0:-11],now.strftime("%Y%m%d"),fn_new[-3::]))  # might be better to use split("-") here, and manybe even a check for IMOS file name
         
         # Add the new file name to the list of new file names
         new_name.append(fn_new)
@@ -40,20 +51,16 @@ def select_in_water(netCDFfile):
         time = np.array(ods.variables['TIME'][:])
         
         inw = parse(ods.time_deployment_start)
-
         outw = parse(ods.time_deployment_end)
         
         # Convert the start and end to the number format used in TIME
         inw_num = date2num(inw.replace(tzinfo=None),units = ods.variables['TIME'].units)
-        
         outw_num = date2num(outw.replace(tzinfo=None),units = ods.variables['TIME'].units)
         
-        # Create logical index of deployed times
-        
+        # Create logical index of deployed times        
         deployed = np.logical_and(time>=inw_num,time<=outw_num)
         
-        # Determine the length of the new time dimension
-        
+        # Determine the length of the new time dimension        
         time_dim = len(time[deployed])
         
         # Create the new netcdf file
@@ -61,14 +68,12 @@ def select_in_water(netCDFfile):
         
         TIME = ds.createDimension("TIME",time_dim)
         
-        # Copy global attributes
-        
+        # Copy global attributes        
         for att in ods.ncattrs():
             
             ds.setncattr(att,ods.getncattr(att))
         
-        # Copy variables
-            
+        # Copy variables            
         for v_name, varin in ods.variables.items():
             
             varout = ds.createVariable(v_name, varin.datatype, varin.dimensions)
@@ -76,8 +81,7 @@ def select_in_water(netCDFfile):
             # Copy variable attributes
             varout.setncatts({k: varin.getncattr(k) for k in varin.ncattrs()})
     
-            # Fill variables with deployed data
-            
+            # Fill variables with deployed data            
             if np.array(varin[:]).size == 1:
                 
                 varout[:] = varin[:]
@@ -86,29 +90,18 @@ def select_in_water(netCDFfile):
                 
                 varout[:] = np.array(varin[:])[deployed]
                 
-        ds.date_created = now.strftime("%Y-%m-%dT%H:%M:%SZ")
+        ds.date_created = now.strftime("%Y-%m-%dT%H:%M:%SZ")     
         
-        ds.history += ' ' + now.strftime("%Y%m%d:") + 'Data subset to only contain deployed (in water) data - the full record can be found in the corresponding FV00 file.'        
-        
-        ds.close()
+        # update the history attribute
+        try:
+            hist = ds.history + "\n"
+        except AttributeError:
+            hist = ""        
+        ds.history += hist + now.strftime("%Y%m%d:") + 'Data subset to only contain deployed (in water) data - the full record can be found in the corresponding FV00 file.'        
         
+        ds.close()       
         ods.close()
-        
-        
-        
-        
-        
-        
-        
-        
-        
-        
-        
-        
-        
-        
-        
-            
-            
-            
-            
\ No newline at end of file
+
+
+if __name__ == "__main__":
+    select_in_water(sys.argv[1:]) 

From 40e89b2f800d0fd7c5d2241b78140316fdf9133f Mon Sep 17 00:00:00 2001
From: bweeding <57697604+bweeding@users.noreply.github.com>
Date: Thu, 13 Feb 2020 15:39:04 +1100
Subject: [PATCH 12/59] Update select_in_water.py

---
 ocean_dp/qc/select_in_water.py | 122 ++++++++++++++++++---------------
 1 file changed, 65 insertions(+), 57 deletions(-)

diff --git a/ocean_dp/qc/select_in_water.py b/ocean_dp/qc/select_in_water.py
index f23df57..6d5b7fb 100755
--- a/ocean_dp/qc/select_in_water.py
+++ b/ocean_dp/qc/select_in_water.py
@@ -35,72 +35,80 @@ def select_in_water(netCDFfiles):
     # loop over all file names given
     for fn in netCDFfiles:
         
-        # Change the creation date in the filename to today
-        now=datetime.utcnow()
+        # Check the file is an IMOS formatted file
+        if fn.split('_')[0]=='IMOS'
         
-        fn_new = fn.replace("FV00", "FV01")        
-        fn_new = "".join((fn_new[0:-11],now.strftime("%Y%m%d"),fn_new[-3::]))  # might be better to use split("-") here, and manybe even a check for IMOS file name
-        
-        # Add the new file name to the list of new file names
-        new_name.append(fn_new)
+            # Change the creation date in the filename to today
+            now=datetime.utcnow()
             
-        # Load the original netcdf file
-        ods = Dataset(fn,'a')
-        
-        # Extract the time dimension, and the deployment start and end        
-        time = np.array(ods.variables['TIME'][:])
-        
-        inw = parse(ods.time_deployment_start)
-        outw = parse(ods.time_deployment_end)
-        
-        # Convert the start and end to the number format used in TIME
-        inw_num = date2num(inw.replace(tzinfo=None),units = ods.variables['TIME'].units)
-        outw_num = date2num(outw.replace(tzinfo=None),units = ods.variables['TIME'].units)
-        
-        # Create logical index of deployed times        
-        deployed = np.logical_and(time>=inw_num,time<=outw_num)
-        
-        # Determine the length of the new time dimension        
-        time_dim = len(time[deployed])
-        
-        # Create the new netcdf file
-        ds = Dataset(fn_new, "w", format="NETCDF4")
-        
-        TIME = ds.createDimension("TIME",time_dim)
-        
-        # Copy global attributes        
-        for att in ods.ncattrs():
+            fn_new = fn.replace("FV00", "FV01")
             
-            ds.setncattr(att,ods.getncattr(att))
-        
-        # Copy variables            
-        for v_name, varin in ods.variables.items():
+            fn_new_split = fn_new.split('_')
             
-            varout = ds.createVariable(v_name, varin.datatype, varin.dimensions)
-    
-            # Copy variable attributes
-            varout.setncatts({k: varin.getncattr(k) for k in varin.ncattrs()})
-    
-            # Fill variables with deployed data            
-            if np.array(varin[:]).size == 1:
-                
-                varout[:] = varin[:]
+            fn_new_split[-1] = "C-" + now.strftime("%Y%m%d")
+            
+            fn_new = '_'.join(fn_new_split)
+            
+            # Add the new file name to the list of new file names
+            new_name.append(fn_new)
                 
-            else:
+            # Load the original netcdf file
+            ods = Dataset(fn,'a')
+            
+            # Extract the time dimension, and the deployment start and end        
+            time = np.array(ods.variables['TIME'][:])
+            
+            inw = parse(ods.time_deployment_start)
+            outw = parse(ods.time_deployment_end)
+            
+            # Convert the start and end to the number format used in TIME
+            inw_num = date2num(inw.replace(tzinfo=None),units = ods.variables['TIME'].units)
+            outw_num = date2num(outw.replace(tzinfo=None),units = ods.variables['TIME'].units)
+            
+            # Create logical index of deployed times        
+            deployed = np.logical_and(time>=inw_num,time<=outw_num)
+            
+            # Determine the length of the new time dimension        
+            time_dim = len(time[deployed])
+            
+            # Create the new netcdf file
+            ds = Dataset(fn_new, "w", format="NETCDF4")
+            
+            TIME = ds.createDimension("TIME",time_dim)
+            
+            # Copy global attributes        
+            for att in ods.ncattrs():
                 
-                varout[:] = np.array(varin[:])[deployed]
+                ds.setncattr(att,ods.getncattr(att))
+            
+            # Copy variables            
+            for v_name, varin in ods.variables.items():
                 
-        ds.date_created = now.strftime("%Y-%m-%dT%H:%M:%SZ")     
+                varout = ds.createVariable(v_name, varin.datatype, varin.dimensions)
         
-        # update the history attribute
-        try:
-            hist = ds.history + "\n"
-        except AttributeError:
-            hist = ""        
-        ds.history += hist + now.strftime("%Y%m%d:") + 'Data subset to only contain deployed (in water) data - the full record can be found in the corresponding FV00 file.'        
+                # Copy variable attributes
+                varout.setncatts({k: varin.getncattr(k) for k in varin.ncattrs()})
         
-        ds.close()       
-        ods.close()
+                # Fill variables with deployed data            
+                if np.array(varin[:]).size == 1:
+                    
+                    varout[:] = varin[:]
+                    
+                else:
+                    
+                    varout[:] = np.array(varin[:])[deployed]
+                    
+            ds.date_created = now.strftime("%Y-%m-%dT%H:%M:%SZ")     
+            
+            # update the history attribute
+            try:
+                hist = ds.history + "\n"
+            except AttributeError:
+                hist = ""        
+            ds.history += hist + now.strftime("%Y%m%d:") + 'Data subset to only contain deployed (in water) data - the full record can be found in the corresponding FV00 file.'        
+            
+            ds.close()       
+            ods.close()
 
 
 if __name__ == "__main__":

From 1e2f36b5818d4f6c97e4c15af5c28baa667a452d Mon Sep 17 00:00:00 2001
From: bweeding <57697604+bweeding@users.noreply.github.com>
Date: Thu, 13 Feb 2020 15:39:28 +1100
Subject: [PATCH 13/59] Update in_water with split and IMOS test

---
 ocean_dp/qc/select_in_water.py | 171 ++++++++++++++++-----------------
 ocean_dp/qc/spike_test         |   8 ++
 2 files changed, 93 insertions(+), 86 deletions(-)
 create mode 100755 ocean_dp/qc/spike_test

diff --git a/ocean_dp/qc/select_in_water.py b/ocean_dp/qc/select_in_water.py
index b600695..6925d12 100755
--- a/ocean_dp/qc/select_in_water.py
+++ b/ocean_dp/qc/select_in_water.py
@@ -1,10 +1,19 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-"""
-Created on Wed Feb 12 09:49:26 2020
+# ocean_dp
+# Copyright (C) 2020 Ben Weeding
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
 
-@author: tru050
-"""
 from dateutil.parser import parse
 from netCDF4 import Dataset, num2date, date2num
 from datetime import datetime, timedelta
@@ -16,99 +25,89 @@
 import os
 import shutil
 
-def select_in_water(netCDFfile):
+
+def select_in_water(netCDFfiles):
     
     new_name = [] # list of new file names
 
     # loop over all file names given
-    for fn in netCDFfile:
-        
-        # Change the creation date in the filename to today
-        now=datetime.utcnow()
+    for fn in netCDFfiles:
         
-        fn_new = fn.replace("FV00", "FV01")
+        # Check the file is an IMOS formatted file
+        if fn.split('_')[0]=='IMOS'
         
-        fn_new = "".join((fn_new[0:-11],now.strftime("%Y%m%d"),fn_new[-3::]))
-        
-        # Add the new file name to the list of new file names
-        new_name.append(fn_new)
+            # Change the creation date in the filename to today
+            now=datetime.utcnow()
             
-        # Load the original netcdf file
-        ods = Dataset(fn,'a')
-        
-        # Extract the time dimension, and the deployment start and end        
-        time = np.array(ods.variables['TIME'][:])
-        
-        inw = parse(ods.time_deployment_start)
-
-        outw = parse(ods.time_deployment_end)
-        
-        # Convert the start and end to the number format used in TIME
-        inw_num = date2num(inw.replace(tzinfo=None),units = ods.variables['TIME'].units)
-        
-        outw_num = date2num(outw.replace(tzinfo=None),units = ods.variables['TIME'].units)
-        
-        # Create logical index of deployed times
-        
-        deployed = np.logical_and(time>=inw_num,time<=outw_num)
-        
-        # Determine the length of the new time dimension
-        
-        time_dim = len(time[deployed])
-        
-        # Create the new netcdf file
-        ds = Dataset(fn_new, "w", format="NETCDF4")
-        
-        TIME = ds.createDimension("TIME",time_dim)
-        
-        # Copy global attributes
-        
-        for att in ods.ncattrs():
+            fn_new = fn.replace("FV00", "FV01")
             
-            ds.setncattr(att,ods.getncattr(att))
-        
-        # Copy variables
+            fn_new_split = fn_new.split('_')
             
-        for v_name, varin in ods.variables.items():
+            fn_new_split[-1] = "C-" + now.strftime("%Y%m%d")
             
-            varout = ds.createVariable(v_name, varin.datatype, varin.dimensions)
-    
-            # Copy variable attributes
-            varout.setncatts({k: varin.getncattr(k) for k in varin.ncattrs()})
-    
-            # Fill variables with deployed data
+            fn_new = '_'.join(fn_new_split)
             
-            if np.array(varin[:]).size == 1:
+            # Add the new file name to the list of new file names
+            new_name.append(fn_new)
                 
-                varout[:] = varin[:]
-                
-            else:
-                
-                varout[:] = np.array(varin[:])[deployed]
+            # Load the original netcdf file
+            ods = Dataset(fn,'a')
+            
+            # Extract the time dimension, and the deployment start and end        
+            time = np.array(ods.variables['TIME'][:])
+            
+            inw = parse(ods.time_deployment_start)
+            outw = parse(ods.time_deployment_end)
+            
+            # Convert the start and end to the number format used in TIME
+            inw_num = date2num(inw.replace(tzinfo=None),units = ods.variables['TIME'].units)
+            outw_num = date2num(outw.replace(tzinfo=None),units = ods.variables['TIME'].units)
+            
+            # Create logical index of deployed times        
+            deployed = np.logical_and(time>=inw_num,time<=outw_num)
+            
+            # Determine the length of the new time dimension        
+            time_dim = len(time[deployed])
+            
+            # Create the new netcdf file
+            ds = Dataset(fn_new, "w", format="NETCDF4")
+            
+            TIME = ds.createDimension("TIME",time_dim)
+            
+            # Copy global attributes        
+            for att in ods.ncattrs():
                 
-        ds.date_created = now.strftime("%Y-%m-%dT%H:%M:%SZ")
-        
-        ds.history += ' ' + now.strftime("%Y%m%d:") + 'Data subset to only contain deployed (in water) data - the full record can be found in the corresponding FV00 file.'        
-        
-        ds.close()
-        
-        ods.close()
-        
-        
-        
-        
-        
-        
-        
-        
-        
-        
-        
-        
-        
-        
-        
+                ds.setncattr(att,ods.getncattr(att))
             
+            # Copy variables            
+            for v_name, varin in ods.variables.items():
+                
+                varout = ds.createVariable(v_name, varin.datatype, varin.dimensions)
+        
+                # Copy variable attributes
+                varout.setncatts({k: varin.getncattr(k) for k in varin.ncattrs()})
+        
+                # Fill variables with deployed data            
+                if np.array(varin[:]).size == 1:
+                    
+                    varout[:] = varin[:]
+                    
+                else:
+                    
+                    varout[:] = np.array(varin[:])[deployed]
+                    
+            ds.date_created = now.strftime("%Y-%m-%dT%H:%M:%SZ")     
             
+            # update the history attribute
+            try:
+                hist = ds.history + "\n"
+            except AttributeError:
+                hist = ""        
+            ds.history += hist + now.strftime("%Y%m%d:") + 'Data subset to only contain deployed (in water) data - the full record can be found in the corresponding FV00 file.'        
             
-            
\ No newline at end of file
+            ds.close()       
+            ods.close()
+
+
+if __name__ == "__main__":
+    select_in_water(sys.argv[1:]) 
\ No newline at end of file
diff --git a/ocean_dp/qc/spike_test b/ocean_dp/qc/spike_test
new file mode 100755
index 0000000..0bb13ea
--- /dev/null
+++ b/ocean_dp/qc/spike_test
@@ -0,0 +1,8 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Wed Feb 12 16:28:38 2020
+
+@author: tru050
+"""
+

From af33590364f1b1d6a98e90e75e43f2cae9b79378 Mon Sep 17 00:00:00 2001
From: bweeding <57697604+bweeding@users.noreply.github.com>
Date: Fri, 14 Feb 2020 14:41:11 +1100
Subject: [PATCH 14/59] Create netcdf_gen.py

---
 ocean_dp/qc/netcdf_gen.py | 118 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 118 insertions(+)
 create mode 100755 ocean_dp/qc/netcdf_gen.py

diff --git a/ocean_dp/qc/netcdf_gen.py b/ocean_dp/qc/netcdf_gen.py
new file mode 100755
index 0000000..f687f83
--- /dev/null
+++ b/ocean_dp/qc/netcdf_gen.py
@@ -0,0 +1,118 @@
+# Copyright (C) 2020 Ben Weeding
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+from dateutil.parser import parse
+from netCDF4 import Dataset, num2date, date2num
+from datetime import datetime, timedelta
+import sys
+from datetime import datetime
+import numpy as np
+from dateutil import parser
+import pytz
+import os
+import shutil
+
+# Provide the function with a filename (don't include .nc), a nominal depth,
+# and pairs of names and arrays containing the data to be included as variables.
+# A time dimension/variable is created by default, starting at 01/01/2020 using 
+# 1 hour timestamps
+
+# For example, netcdf_gen('test',30,'PRES',pres_data,'TEMP',temp_data)
+
+def netcdf_gen(file_name,nominal_depth,*args):
+    
+    # Convert the args tuple to a list
+    args = list(args)
+    
+    # Check the args are paired
+    if len(args) % 2 == 0:
+        
+        # Assign the names and data to lists
+        var_names = args[0::2]
+        
+        var_data = args[1::2]    
+        
+        # Check if first of each pair is a string
+        if all(isinstance(x, str) for x in var_names):
+        
+            # Check if second of each pair are all equal in shape
+            if all(np.shape(var_data[1]) == np.shape(x) for x in var_data):
+                
+                # Create the netcdf with IMOS tag
+                ds = Dataset("IMOS_" + file_name + ".nc","w", format="NETCDF4")
+                
+                # Create time dimension with length to match data
+                time_dim = ds.createDimension("TIME", len(var_data[0]))
+                
+                time_var = ds.createVariable("TIME","f8",("TIME"))
+                
+                ds.variables['TIME'][:] = np.arange(25567,25567+(1/24)*len(var_data[1]),1/24)
+                
+                time_atts = ['long_name','time','units','days since 1950-01-01 00:00:00 UTC',
+                 'calendar','gregorian','axis','T','standard_name','time','valid_max',
+                 90000,'valid_min',0]  
+                
+                for att_name,att_value in zip(time_atts[0::2],time_atts[1::2]):
+                    
+                    time_var.setncattr(att_name,att_value)
+                
+                # Create the nominal depth variable
+                nom_depth_var = ds.createVariable("NOMINAL_DEPTH","f8")
+                
+                ds.variables["NOMINAL_DEPTH"] = nominal_depth
+                
+                nom_dep_atts = ['long_name','nominal depth','units','m',
+                 'positive','down','axis','Z','standard_name','depth','valid_max',
+                 12000,'valid_min',-5,'reference_datum','sea surface'] 
+                
+                for att_name,att_value in zip(nom_dep_atts[0::2],nom_dep_atts[1::2]):
+                    
+                    nom_depth_var.setncattr(att_name,att_value)
+                
+                # Create variables from input data
+                for name_in,data_in in zip(var_names,var_data):
+                    
+                    ds.createVariable(name_in,"f8",("TIME"))
+                    
+                    ds.variables[name_in][:] = data_in
+                    
+                ds.close()
+            
+            else:
+                print('Data arrays not of equal length')
+            
+        
+        else:
+            print('Labels not in string format')
+        
+    else:
+        print('Data not passed in pairs')
+        
+        
+
+        
+        
+        
+        
+        
+        
+        
+        
+        
+        
+        
+        
+        
+

From c61b8ba5ddd223ef3d32db43683a327e92a4b57a Mon Sep 17 00:00:00 2001
From: bweeding <57697604+bweeding@users.noreply.github.com>
Date: Mon, 17 Feb 2020 16:19:55 +1100
Subject: [PATCH 15/59] Update select_in_water.py

---
 ocean_dp/qc/select_in_water.py | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/ocean_dp/qc/select_in_water.py b/ocean_dp/qc/select_in_water.py
index 02ccb49..68f57be 100755
--- a/ocean_dp/qc/select_in_water.py
+++ b/ocean_dp/qc/select_in_water.py
@@ -27,6 +27,7 @@
 import os
 import shutil
 
+# Submit argument as a list
 
 def select_in_water(netCDFfiles):
     
@@ -36,7 +37,7 @@ def select_in_water(netCDFfiles):
     for fn in netCDFfiles:
         
         # Check the file is an IMOS formatted file
-        if fn.split('_')[0]=='IMOS'
+        if fn.split('_')[0]=='IMOS':
         
             # Change the creation date in the filename to today
             now=datetime.utcnow()
@@ -45,7 +46,7 @@ def select_in_water(netCDFfiles):
             
             fn_new_split = fn_new.split('_')
             
-            fn_new_split[-1] = "C-" + now.strftime("%Y%m%d")
+            fn_new_split[-1] = "C-" + now.strftime("%Y%m%d") + ".nc"
             
             fn_new = '_'.join(fn_new_split)
             
@@ -100,6 +101,12 @@ def select_in_water(netCDFfiles):
                     
             ds.date_created = now.strftime("%Y-%m-%dT%H:%M:%SZ")     
             
+            # update the time coverage attributes
+            
+            ds.time_coverage_start = ods.time_deployment_start
+            
+            ds.time_coverage_end = ods.time_deployment_end
+            
             # update the history attribute
             try:
                 hist = ds.history + "\n"

From 04badbfce75d0df9f21055b11854ae33516d81cd Mon Sep 17 00:00:00 2001
From: Peter Jansen <petejan@users.noreply.github.com>
Date: Tue, 18 Feb 2020 09:31:04 +1100
Subject: [PATCH 16/59] Update select_in_water.py

Remove some white space.
Make select_in_water able to operate on non-IMOS file names.
TODO: find the TIME dimension (or the dimension of the TIME variable).
---
 ocean_dp/qc/select_in_water.py | 148 ++++++++++++++++-----------------
 1 file changed, 72 insertions(+), 76 deletions(-)

diff --git a/ocean_dp/qc/select_in_water.py b/ocean_dp/qc/select_in_water.py
index 68f57be..5118ae8 100755
--- a/ocean_dp/qc/select_in_water.py
+++ b/ocean_dp/qc/select_in_water.py
@@ -32,91 +32,87 @@
 def select_in_water(netCDFfiles):
     
     new_name = [] # list of new file names
+    now = datetime.utcnow()            
 
     # loop over all file names given
     for fn in netCDFfiles:
         
         # Check the file is an IMOS formatted file
-        if fn.split('_')[0]=='IMOS':
-        
+        if fn.split('_')[0]=='IMOS':        
+            fn_new = fn_new.replace("FV00", "FV01")           
+            fn_new_split = fn_new.split('_')            
             # Change the creation date in the filename to today
-            now=datetime.utcnow()
-            
-            fn_new = fn.replace("FV00", "FV01")
-            
-            fn_new_split = fn_new.split('_')
-            
-            fn_new_split[-1] = "C-" + now.strftime("%Y%m%d") + ".nc"
-            
+            fn_new_split[-1] = "C-" + now.strftime("%Y%m%d") + ".nc"            
             fn_new = '_'.join(fn_new_split)
+        else:
+            fn_new = fn.replace(".nc", "-trim.nc")
             
-            # Add the new file name to the list of new file names
-            new_name.append(fn_new)
-                
-            # Load the original netcdf file
-            ods = Dataset(fn,'a')
-            
-            # Extract the time dimension, and the deployment start and end        
-            time = np.array(ods.variables['TIME'][:])
-            
-            inw = parse(ods.time_deployment_start)
-            outw = parse(ods.time_deployment_end)
-            
-            # Convert the start and end to the number format used in TIME
-            inw_num = date2num(inw.replace(tzinfo=None),units = ods.variables['TIME'].units)
-            outw_num = date2num(outw.replace(tzinfo=None),units = ods.variables['TIME'].units)
-            
-            # Create logical index of deployed times        
-            deployed = np.logical_and(time>=inw_num,time<=outw_num)
-            
-            # Determine the length of the new time dimension        
-            time_dim = len(time[deployed])
-            
-            # Create the new netcdf file
-            ds = Dataset(fn_new, "w", format="NETCDF4")
-            
-            TIME = ds.createDimension("TIME",time_dim)
-            
-            # Copy global attributes        
-            for att in ods.ncattrs():
-                
-                ds.setncattr(att,ods.getncattr(att))
-            
-            # Copy variables            
-            for v_name, varin in ods.variables.items():
-                
-                varout = ds.createVariable(v_name, varin.datatype, varin.dimensions)
-        
-                # Copy variable attributes
-                varout.setncatts({k: varin.getncattr(k) for k in varin.ncattrs()})
+        # Add the new file name to the list of new file names
+        new_name.append(fn_new)
+
+        # Load the original netcdf file
+        ods = Dataset(fn,'a')
+
+        # Extract the time dimension, and the deployment start and end    
+        # TODO: check this works
+        v = nc.get_variables_by_attributes(standard_name='time')
+        time = np.array(v[0][:])
+
+        inw = parse(ods.time_deployment_start)
+        outw = parse(ods.time_deployment_end)
+
+        # Convert the start and end to the number format used in TIME
+        inw_num = date2num(inw.replace(tzinfo=None), units=ods.variables['TIME'].units)
+        outw_num = date2num(outw.replace(tzinfo=None), units=ods.variables['TIME'].units)
+
+        # Create logical index of deployed times        
+        deployed = np.logical_and(time>=inw_num, time<=outw_num)
+
+        # Determine the length of the new time dimension        
+        time_dim_len = len(time[deployed])
+
+        # Create the new netcdf file
+        ds = Dataset(fn_new, "w", format="NETCDF4")
+
+        new_time_dim = ds.createDimension("TIME", time_dim_len)
+
+        # Copy global attributes        
+        for att in ods.ncattrs():
+            ds.setncattr(att, ods.getncattr(att))
+
+        # Copy variables            
+        for v_name, varin in ods.variables.items():
+
+            varout = ds.createVariable(v_name, varin.datatype, varin.dimensions)
+
+            # Copy variable attributes
+            varout.setncatts({k: varin.getncattr(k) for k in varin.ncattrs()})
+
+            # Fill variables with deployed data            
+            # TODO: should check if the dimensions for the variable include TIME, and truncate that dimension
+            if np.array(varin[:]).size == 1:
+                varout[:] = varin[:]
+            else:
+                varout[:] = np.array(varin[:])[deployed]
+
+        ds.date_created = now.strftime("%Y-%m-%dT%H:%M:%SZ")     
+
+        # update the time coverage attributes            
+        ds.time_coverage_start = ods.time_deployment_start            
+        ds.time_coverage_end = ods.time_deployment_end
+
+        # update the history attribute
+        try:
+            hist = ds.history + "\n"
+        except AttributeError:
+            hist = ""        
+        ds.history += hist + now.strftime("%Y%m%d:") + 'Data subset to only contain deployed (in water) data - the full record can be found in the corresponding FV00 file.'        
+
+        ds.close()       
+        ods.close()
         
-                # Fill variables with deployed data            
-                if np.array(varin[:]).size == 1:
-                    
-                    varout[:] = varin[:]
-                    
-                else:
-                    
-                    varout[:] = np.array(varin[:])[deployed]
-                    
-            ds.date_created = now.strftime("%Y-%m-%dT%H:%M:%SZ")     
-            
-            # update the time coverage attributes
-            
-            ds.time_coverage_start = ods.time_deployment_start
-            
-            ds.time_coverage_end = ods.time_deployment_end
-            
-            # update the history attribute
-            try:
-                hist = ds.history + "\n"
-            except AttributeError:
-                hist = ""        
-            ds.history += hist + now.strftime("%Y%m%d:") + 'Data subset to only contain deployed (in water) data - the full record can be found in the corresponding FV00 file.'        
-            
-            ds.close()       
-            ods.close()
+    return new_name
 
 
 if __name__ == "__main__":
-    select_in_water(sys.argv[1:]) 
\ No newline at end of file
+    select_in_water(sys.argv[1:]) 

From e5559ca5e31d0dd90edcd2b1748c6523eac34976 Mon Sep 17 00:00:00 2001
From: bweeding <57697604+bweeding@users.noreply.github.com>
Date: Wed, 19 Feb 2020 11:25:52 +1100
Subject: [PATCH 17/59] Update pressure_interpolator.py

---
 ocean_dp/processing/pressure_interpolator.py | 184 ++++++++++++-------
 1 file changed, 122 insertions(+), 62 deletions(-)

diff --git a/ocean_dp/processing/pressure_interpolator.py b/ocean_dp/processing/pressure_interpolator.py
index cb929af..ed764c3 100755
--- a/ocean_dp/processing/pressure_interpolator.py
+++ b/ocean_dp/processing/pressure_interpolator.py
@@ -22,26 +22,54 @@
 import os
 import shutil
 
-def pressure_interpolator:
-    
-    # Load the filenames of the FV00 files in the current folder
-    fv00_files = glob.glob('*FV00*.nc');
+# Supply netCDFfiles as a ['list'] of files, agg as a 'string'
 
-    # Extract the aggregate file data
-    agg = Dataset(glob.glob('*Aggregate*.nc')[0], mode="r")
+def pressure_interpolator(netCDFfiles = None,agg = None):
+    
+    if netCDFfiles==None:
     
-    # Loop through each of the FV00 files
-    for i in fv00_files:
+        # Load the filenames of the fv01 files in the current folder
+        netCDFfiles = glob.glob('*FV01*.nc')
+            
+    if agg ==None:
         
-        # Extract the contents of the current file
-        fv00_contents = Dataset(i, mode="r")
+    # Extract the aggregate file data
+        agg = Dataset(glob.glob('*Aggregate*.nc')[0], mode="r")
+        
+    else:
+        
+        agg = Dataset(glob.glob(agg)[0], mode="r")
+        
+    # Loop through each of the fv01 files
+    for fn in netCDFfiles:
+        
+        # Change the creation date in the filename to today
+        now=datetime.utcnow()
+            
+        fn_new_split = fn.split('_')
+            
+        fn_new_split[-1] = "C-" + now.strftime("%Y%m%d") + ".nc"
+        
+        fn_new_split[2] += 'IP'
+            
+        fn_new = '_'.join(fn_new_split)
+
+        
+        # If a new (different) filename has been successfully generated, make 
+        # a copy of the old file with the new name
+        if fn_new != fn:
+            # copy file
+            shutil.copy(fn, fn_new)
+        
+        # Open and work in the new copy
+        fv01_contents = Dataset(fn_new,mode='a')
         
         # Check the current file doesn't contain pressure to run the following
         # interpolator
-        if not 'PRES' in fv00_contents.variables:
+        if not 'PRES' in fv01_contents.variables:
             
             # Create a NaN array to fill with pressure values
-            interp_agg_pres = np.full((len(agg.variables["NOMINAL_DEPTH"])+1,len(fv00_contents.variables["TIME"])),np.nan)
+            interp_agg_pres = np.full((len(agg.variables["NOMINAL_DEPTH"])+1,len(fv01_contents.variables["TIME"])),np.nan)
             
             # Set the first row as zeros to set 0m as 0dbar
             interp_agg_pres[0,:] = 0
@@ -50,14 +78,14 @@ def pressure_interpolator:
             # including the 0m values
             agg_nominal_depths = np.insert(np.array(agg.variables["NOMINAL_DEPTH"][:]),0,0)
             
-            # For each nominal depth, interpolate the agg data at the FV00 times
+            # For each nominal depth, interpolate the agg data at the fv01 times
             for j in range(1,len(agg_nominal_depths)):
                 
                 time_selection = agg.variables["TIME"][agg.variables["instrument_index"][:]==(j-1)]
                 
                 pres_selection = agg.variables["PRES"][agg.variables["instrument_index"][:]==(j-1)]
                 
-                interp_agg_pres[j,:] = np.interp(fv00_contents.variables["TIME"][:],time_selection,pres_selection)
+                interp_agg_pres[j,:] = np.interp(fv01_contents.variables["TIME"][:],time_selection,pres_selection)
                                            
             # Sort the nominal depths and pressures according to nominal depth
             interp_agg_pres = interp_agg_pres[np.argsort(agg_nominal_depths),:]
@@ -71,7 +99,7 @@ def pressure_interpolator:
                 interp_agg_pres_df = pd.DataFrame(data=interp_agg_pres,index=agg_nominal_depths)
                 
                 # Find all the columns where the lowest element is NaN
-                nan_cols = interp_agg_pres_df[interp_agg_pres_df[-1:].isna()].tolist()
+                nan_cols = np.where(interp_agg_pres_df.iloc[-1].isna())
                 
                 # Select each column containing an NaN as the deepest value
                 for j in nan_cols:
@@ -94,85 +122,117 @@ def pressure_interpolator:
                 # Convert the DataFrame back to an array
                 interp_agg_pres =  interp_agg_pres_df.to_numpy()
             
-            # Create a NaN array to receive the FV00 interpolated pressures
-            interp_fv00_pres = np.full((np.shape(fv00_contents.variables["TIME"][:])),np.nan)
+            # Create a NaN array to receive the fv01 interpolated pressures
+            interp_fv01_pres = np.full((np.shape(fv01_contents.variables["TIME"][:])),np.nan)
             
-            # At each timestamp, interpolate pressure for the FV00 data
-            for j in range(len(fv00_contents.variables["TIME"])):
+            # At each timestamp, interpolate pressure for the fv01 data
+            for j in range(len(fv01_contents.variables["TIME"])):
             
-                interp_fv00_pres[j] = np.interp(fv00_contents.variables["NOMINAL_DEPTH"][0],agg_nominal_depths,interp_agg_pres[:,j])
+                interp_fv01_pres[j] = np.interp(fv01_contents.variables["NOMINAL_DEPTH"][0],agg_nominal_depths,interp_agg_pres[:,j])
         
-        # Use methods from add_qc_flags to make a new netcdf?
+            # Create the PRES and PRES_quality_control variables, and their attributes
+                
+            pres_var = fv01_contents.createVariable('PRES','f8',fv01_contents.variables['TIME'].dimensions,fill_value=99, zlib=True)
+            
+            pres_atts = ['long_name','sea_water_pressure_due_to_sea_water','units','dbar',
+            'standard_name','coordinates','TIME LATITUDE LONGITUDE NOMINAL_DEPTH','sea_water_pressure_due_to_sea_water','valid_max',
+            12000,'valid_min',-15] 
+                
+            for att_name,att_value in zip(pres_atts[0::2],pres_atts[1::2]):
+                    
+                pres_var.setncattr(att_name,att_value)
+                        
+            pres_var[:] = interp_fv01_pres
+            
+            
+            pres_qc_var = fv01_contents.createVariable('PRES_quality_control','i1',fv01_contents.variables['TIME'].dimensions,fill_value=99, zlib=True)
+            
+            pres_qc_var.long_name = "quality_code for PRES"
+            
+            pres_qc_var.flag_values = np.array([0, 1, 2, 3, 4, 6, 7, 9])
+            
+            pres_qc_var.flag_meanings = 'unknown good_data probably_good_data probably_bad_data bad_data not_deployed interpolated missing_value'
+                    
+            pres_qc_var[:] = 7
+            
+            pres_var.ancillary_variables = "PRES_quality_control"
+            
+            # Close the netcdf files
+            
+            fv01_contents.close()
+            
+            agg.close()
         
-        # Deal with files that already contain pressure, but may contain NaNs            
-        else:
-             # Create a NaN array to fill with pressure values
-            interp_agg_pres = np.full((len(agg.variables["NOMINAL_DEPTH"])+1,len(fv00_contents.variables["TIME"])),np.nan)
+        # Deal with files that already contain pressure, but contain NaNs            
+        elif any(np.isnan(agg.variables['PRES'][:])):
+            
+            # Create a NaN array to fill with pressure values
+            interp_agg_pres = np.full((len(agg.variables["NOMINAL_DEPTH"])+1,len(fv01_contents.variables["TIME"])),np.nan)
             
             # Set the first row as zeros to set 0m as 0dbar
             interp_agg_pres[0,:] = 0
             
-            # Set the last row to 5000 to set 5000m as 5000dbar (~seafloor), 
-            # only for interpolation in cases where the deepest sensor has failed
-            #interp_agg_pres[-1,:] = 5000
-            
             # Create a new array representing the nominal depths of the agg file,
             # including the 0m values
             agg_nominal_depths = np.insert(np.array(agg.variables["NOMINAL_DEPTH"][:]),0,0)
             
-            # For each nominal depth, interpolate the agg data at the FV00 times
+            # For each nominal depth, interpolate the agg data at the fv01 times
             for j in range(1,len(agg_nominal_depths)):
                 
                 time_selection = agg.variables["TIME"][agg.variables["instrument_index"][:]==(j-1)]
                 
                 pres_selection = agg.variables["PRES"][agg.variables["instrument_index"][:]==(j-1)]
                 
-                interp_agg_pres[j,:] = np.interp(fv00_contents.variables["TIME"][:],time_selection,pres_selection)
+                interp_agg_pres[j,:] = np.interp(fv01_contents.variables["TIME"][:],time_selection,pres_selection)
                                            
             # Sort the nominal depths and pressures according to nominal depth
             interp_agg_pres = interp_agg_pres[np.argsort(agg_nominal_depths),:]
         
             agg_nominal_depths.sort()
+                
+            # Make a dataframe of the interpolated pressure to handle NaNs easily
+            interp_agg_pres_df = pd.DataFrame(data=interp_agg_pres,index=agg_nominal_depths)
             
-            # If there are any NaN values, linearly interpolate profilewise
-            if np.isnan(np.sum(interp_agg_pres)):
+            # Find all the columns where the lowest element is NaN
+            nan_cols = np.where(interp_agg_pres_df.iloc[-1].isna())
+            
+            # Select each column containing an NaN as the deepest value
+            for j in nan_cols:
                 
-                # Make a dataframe of the interpolated pressure to handle NaNs easily
-                interp_agg_pres_df = pd.DataFrame(data=interp_agg_pres,index=agg_nominal_depths)
+                # Find the shallowest nominal depth that isn't NaN
+                shallowest_val = pd.Series.last_valid_index(interp_agg_pres_df.iloc[:,j])
                 
-                # Find all the columns where the lowest element is NaN
-                nan_cols = interp_agg_pres_df[interp_agg_pres_df[-1:].isna()].tolist()
+                # Find the index of that nominal depth
+                shallowest_idx = interp_agg_pres_df.index.tolist().index(shallowest_val)
                 
-                # Select each column containing an NaN as the deepest value
-                for j in nan_cols:
+                # Starting at the shallowest NaN in a continous block of NaNs to the bottom
+                for k in range(shallowest_idx+1,len(interp_agg_pres_df)):
                     
-                    # Find the shallowest nominal depth that isn't NaN
-                    shallowest_val = pd.Series.last_valid_index(interp_agg_pres_df.iloc[:,j])
-                    
-                    # Find the index of that nominal depth
-                    shallowest_idx = interp_agg_pres_df.index.tolist().index(shallowest_val)
+                    # Linearly interpolate from shallow to deep, based on a nominal depth difference of 1m equating to 1dbar
+                    interp_agg_pres_df.iloc[k,j] = interp_agg_pres_df.iloc[k-1,j]+np.diff(interp_agg_pres_df.index)[k-1]
                     
-                    # Starting at the shallowest NaN in a continous block of NaNs to the bottom
-                    for k in range(shallowest_idx+1,len(interp_agg_pres_df)):
-                        
-                        # Linearly interpolate from shallow to deep, based on a nominal depth difference of 1m equating to 1dbar
-                        interp_agg_pres_df.iloc[k,j] = interp_agg_pres_df.iloc[k-1,j]+np.diff(interp_agg_pres_df.index)[k-1]
-                        
-                # Linearly interpolate any remaining NaNs
-                interp_agg_pres_df = interp_agg_pres_df.interpolate(method="index")
-                
-                # Convert the DataFrame back to an array
-                interp_agg_pres =  interp_agg_pres_df.to_numpy()
-            
-            # Create a NaN array to receive the FV00 interpolated pressures
-            interp_fv00_pres = np.full((np.shape(fv00_contents.variables["TIME"][:])),np.nan)
+            # Linearly interpolate any remaining NaNs
+            interp_agg_pres_df = interp_agg_pres_df.interpolate(method="index")
             
+            # Convert the DataFrame back to an array
+            interp_agg_pres =  interp_agg_pres_df.to_numpy()
+        
+            # Create a NaN array to receive the fv01 interpolated pressures
+            interp_fv01_pres = np.full((np.shape(fv01_contents.variables["TIME"][:])),np.nan)
+        
             # Extract the interpolated pressures (NaNs removed) to store in netCDF4
-            interp_fv00_pres = interp_agg_pres_df[interp_agg_pres_df.index==fv00_contents.variables["NOMINAL_DEPTH"][:]]
+            interp_fv01_pres = interp_agg_pres_df[interp_agg_pres_df.index==fv01_contents.variables["NOMINAL_DEPTH"][:]]
+        
+            # Find indices where the netcdf data and interpolated data don't match (where the NaNs are in the netcdf)
+            nan_rep_idx = np.where(interp_fv01_pres!=fv01_contents.variables['PRES'][:])[1]
+            
+            fv01_contents.variables['PRES_quality_control'][nan_rep_idx] = 7
+        
+            # Insert pressure value with NaNs interpolated back into netcdf
+            fv01_contents.variables['PRES'][:] = interp_fv01_pres
+            
+            fv01_contents.close()
             
-        #
-                    
-                
                 
                 
                 

From 500830bb2173269f0e6bc5d27d39317f3d4584a9 Mon Sep 17 00:00:00 2001
From: bweeding <57697604+bweeding@users.noreply.github.com>
Date: Wed, 19 Feb 2020 15:40:05 +1100
Subject: [PATCH 18/59] Update pressure_interpolator.py

---
 ocean_dp/processing/pressure_interpolator.py | 34 ++++++++++++++++----
 1 file changed, 28 insertions(+), 6 deletions(-)

diff --git a/ocean_dp/processing/pressure_interpolator.py b/ocean_dp/processing/pressure_interpolator.py
index ed764c3..af248c2 100755
--- a/ocean_dp/processing/pressure_interpolator.py
+++ b/ocean_dp/processing/pressure_interpolator.py
@@ -24,14 +24,18 @@
 
 # Supply netCDFfiles as a ['list'] of files, agg as a 'string'
 
-def pressure_interpolator(netCDFfiles = None,agg = None):
+def pressure_interpolator(netCDFfiles = [],agg = []):
     
-    if netCDFfiles==None:
+    if netCDFfiles==[]:
+        
+        print('netcdffiles = none')
     
         # Load the filenames of the fv01 files in the current folder
         netCDFfiles = glob.glob('*FV01*.nc')
             
-    if agg ==None:
+    if agg == []:
+        
+        print('agg = none')
         
     # Extract the aggregate file data
         agg = Dataset(glob.glob('*Aggregate*.nc')[0], mode="r")
@@ -43,6 +47,8 @@ def pressure_interpolator(netCDFfiles = None,agg = None):
     # Loop through each of the fv01 files
     for fn in netCDFfiles:
         
+        print('File selected is '+fn)
+        
         # Change the creation date in the filename to today
         now=datetime.utcnow()
             
@@ -58,16 +64,25 @@ def pressure_interpolator(netCDFfiles = None,agg = None):
         # If a new (different) filename has been successfully generated, make 
         # a copy of the old file with the new name
         if fn_new != fn:
+            
+            print('copying file')
             # copy file
             shutil.copy(fn, fn_new)
         
         # Open and work in the new copy
         fv01_contents = Dataset(fn_new,mode='a')
         
+        print('copied file opened')
+        
         # Check the current file doesn't contain pressure to run the following
         # interpolator
         if not 'PRES' in fv01_contents.variables:
             
+            print("file doesn't contain pressure")
+            
+            print(fv01_contents.variables.keys())
+            print(agg.variables.keys())
+            
             # Create a NaN array to fill with pressure values
             interp_agg_pres = np.full((len(agg.variables["NOMINAL_DEPTH"])+1,len(fv01_contents.variables["TIME"])),np.nan)
             
@@ -160,11 +175,11 @@ def pressure_interpolator(netCDFfiles = None,agg = None):
             # Close the netcdf files
             
             fv01_contents.close()
-            
-            agg.close()
         
         # Deal with files that already contain pressure, but contain NaNs            
-        elif any(np.isnan(agg.variables['PRES'][:])):
+        elif any(np.isnan(np.array(fv01_contents.variables['PRES'][:]))):
+            
+            print("file contains pressure and agg contains NaNs")
             
             # Create a NaN array to fill with pressure values
             interp_agg_pres = np.full((len(agg.variables["NOMINAL_DEPTH"])+1,len(fv01_contents.variables["TIME"])),np.nan)
@@ -226,13 +241,20 @@ def pressure_interpolator(netCDFfiles = None,agg = None):
             # Find indices where the netcdf data and interpolated data don't match (where the NaNs are in the netcdf)
             nan_rep_idx = np.where(interp_fv01_pres!=fv01_contents.variables['PRES'][:])[1]
             
+            #
             fv01_contents.variables['PRES_quality_control'][nan_rep_idx] = 7
         
+            print('QC altered in original press')
+        
             # Insert pressure value with NaNs interpolated back into netcdf
             fv01_contents.variables['PRES'][:] = interp_fv01_pres
             
+            print('press altered in orginal press')
+            
             fv01_contents.close()
             
+    agg.close()
+            
                 
                 
                 

From 959426b9c483ac157c42f4eaf70c61ad437f30d1 Mon Sep 17 00:00:00 2001
From: bweeding <57697604+bweeding@users.noreply.github.com>
Date: Wed, 19 Feb 2020 16:13:13 +1100
Subject: [PATCH 19/59] Update select_in_water.py

---
 ocean_dp/qc/select_in_water.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ocean_dp/qc/select_in_water.py b/ocean_dp/qc/select_in_water.py
index 5118ae8..16cf613 100755
--- a/ocean_dp/qc/select_in_water.py
+++ b/ocean_dp/qc/select_in_water.py
@@ -39,7 +39,7 @@ def select_in_water(netCDFfiles):
         
         # Check the file is an IMOS formatted file
         if fn.split('_')[0]=='IMOS':        
-            fn_new = fn_new.replace("FV00", "FV01")           
+            fn_new = fn.replace("FV00", "FV01")           
             fn_new_split = fn_new.split('_')            
             # Change the creation date in the filename to today
             fn_new_split[-1] = "C-" + now.strftime("%Y%m%d") + ".nc"            
@@ -55,7 +55,7 @@ def select_in_water(netCDFfiles):
 
         # Extract the time dimension, and the deployment start and end    
         # TODO: check this works
-        v = nc.get_variables_by_attributes(standard_name='time')
+        v = ods.get_variables_by_attributes(standard_name='time')
         time = np.array(v[0][:])
 
         inw = parse(ods.time_deployment_start)

From d8cbcd493728d642d3ba4e07b2bd003a703791e0 Mon Sep 17 00:00:00 2001
From: bweeding <57697604+bweeding@users.noreply.github.com>
Date: Thu, 20 Feb 2020 08:22:39 +1100
Subject: [PATCH 20/59] Update copyDataset.py

---
 ocean_dp/aggregation/copyDataset.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ocean_dp/aggregation/copyDataset.py b/ocean_dp/aggregation/copyDataset.py
index d23fe7d..95c5998 100644
--- a/ocean_dp/aggregation/copyDataset.py
+++ b/ocean_dp/aggregation/copyDataset.py
@@ -21,7 +21,7 @@
 # http://thredds.aodn.org.au/thredds/catalog/IMOS/ABOS/DA/EAC2000/catalog.html
 
 from dateutil.parser import parse
-
+ 
 
 def aggregate(files, varNames):
     # split this into   createCatalog - copy needed information into structure
@@ -244,7 +244,7 @@ def aggregate(files, varNames):
     filen = 0
 
     # variables we want regardless 
-    varNames = [varNames]+['LATITUDE', 'LONGITUDE', 'NOMINAL_DEPTH']
+    varNames.extend(['LATITUDE', 'LONGITUDE', 'NOMINAL_DEPTH'])
 
     # remove any duplicates
     varNamesOut = set(varNames)

From 20fe907d30a01acbb916ddb8190e70e9778916b3 Mon Sep 17 00:00:00 2001
From: bweeding <57697604+bweeding@users.noreply.github.com>
Date: Thu, 20 Feb 2020 08:48:17 +1100
Subject: [PATCH 21/59] Update netcdf_gen.py

---
 ocean_dp/qc/netcdf_gen.py | 229 +++++++++++++++++++-------------------
 1 file changed, 112 insertions(+), 117 deletions(-)

diff --git a/ocean_dp/qc/netcdf_gen.py b/ocean_dp/qc/netcdf_gen.py
index f687f83..d20f998 100755
--- a/ocean_dp/qc/netcdf_gen.py
+++ b/ocean_dp/qc/netcdf_gen.py
@@ -1,118 +1,113 @@
-# Copyright (C) 2020 Ben Weeding
-#
-# This program is free software: you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation, either version 3 of the License, or
-# (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program. If not, see <http://www.gnu.org/licenses/>.
-
-from dateutil.parser import parse
-from netCDF4 import Dataset, num2date, date2num
-from datetime import datetime, timedelta
-import sys
-from datetime import datetime
-import numpy as np
-from dateutil import parser
-import pytz
-import os
-import shutil
-
-# Provide the function with a filename (don't include .nc), a nominal depth,
-# and pairs of names and arrays containing the data to be included as variables.
-# A time dimension/variable is created by default, starting at 01/01/2020 using 
-# 1 hour timestamps
-
-# For example, netcdf_gen('test',30,'PRES',pres_data,'TEMP',temp_data)
-
-def netcdf_gen(file_name,nominal_depth,*args):
-    
-    # Convert the args tuple to a list
-    args = list(args)
-    
-    # Check the args are paired
-    if len(args) % 2 == 0:
-        
-        # Assign the names and data to lists
-        var_names = args[0::2]
-        
-        var_data = args[1::2]    
-        
-        # Check if first of each pair is a string
-        if all(isinstance(x, str) for x in var_names):
-        
-            # Check if second of each pair are all equal in shape
-            if all(np.shape(var_data[1]) == np.shape(x) for x in var_data):
-                
-                # Create the netcdf with IMOS tag
-                ds = Dataset("IMOS_" + file_name + ".nc","w", format="NETCDF4")
-                
-                # Create time dimension with length to match data
-                time_dim = ds.createDimension("TIME", len(var_data[0]))
-                
-                time_var = ds.createVariable("TIME","f8",("TIME"))
-                
-                ds.variables['TIME'][:] = np.arange(25567,25567+(1/24)*len(var_data[1]),1/24)
-                
-                time_atts = ['long_name','time','units','days since 1950-01-01 00:00:00 UTC',
-                 'calendar','gregorian','axis','T','standard_name','time','valid_max',
-                 90000,'valid_min',0]  
-                
-                for att_name,att_value in zip(time_atts[0::2],time_atts[1::2]):
-                    
-                    time_var.setncattr(att_name,att_value)
-                
-                # Create the nominal depth variable
-                nom_depth_var = ds.createVariable("NOMINAL_DEPTH","f8")
-                
-                ds.variables["NOMINAL_DEPTH"] = nominal_depth
-                
-                nom_dep_atts = ['long_name','nominal depth','units','m',
-                 'positive','down','axis','Z','standard_name','depth','valid_max',
-                 12000,'valid_min',-5,'reference_datum','sea surface'] 
-                
-                for att_name,att_value in zip(nom_dep_atts[0::2],nom_dep_atts[1::2]):
-                    
-                    nom_depth_var.setncattr(att_name,att_value)
-                
-                # Create variables from input data
-                for name_in,data_in in zip(var_names,var_data):
-                    
-                    ds.createVariable(name_in,"f8",("TIME"))
-                    
-                    ds.variables[name_in][:] = data_in
-                    
-                ds.close()
-            
-            else:
-                print('Data arrays not of equal length')
-            
-        
-        else:
-            print('Labels not in string format')
-        
-    else:
-        print('Data not passed in pairs')
-        
-        
-
-        
-        
-        
-        
-        
-        
-        
-        
-        
-        
-        
-        
-        
+		 # Copyright (C) 2020 Ben Weeding
+ #
+ # This program is free software: you can redistribute it and/or modify
+ # it under the terms of the GNU General Public License as published by
+ # the Free Software Foundation, either version 3 of the License, or
+ # (at your option) any later version.
+ #
+ # This program is distributed in the hope that it will be useful,
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ # GNU General Public License for more details.
+ #
+ # You should have received a copy of the GNU General Public License
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
 
+  from netCDF4 import Dataset, date2num
+ import sys
+ from datetime import datetime
+ import numpy as np
+
+  # Provide the function with a filename (don't include .nc), a nominal depth,
+ # and pairs of names and arrays containing the data to be included as variables.
+ # A time dimension/variable is created by default, starting at 01/01/2020 using
+ # 1 hour timestamps
+
+  # For example, netcdf_gen('test',30,'PRES',pres_data,'TEMP',temp_data)
+ # from the command line gen_test_data.py test 30 PRES 10,20,30 TEMP 11,12,NaN
+
+  def netcdf_gen(file_name, nominal_depth, *args):
+     # Convert the args tuple to a list
+     args = list(args)
+     #print(args, type(args[1]))
+
+      file_name = "IMOS_" + file_name + ".nc" # if we insist on not wanting to pass these
+
+      # deal with passing nominal depth as a string
+     if isinstance(nominal_depth, str):
+         nominal_depth = float(nominal_depth)
+         print('nominal depth :', nominal_depth)
+
+      # Check the args are paired
+     if len(args) % 2 == 0:
+
+          # Assign the names and data to lists
+         var_names = args[0::2]
+
+          # deal with passing data as a string list of values
+         if isinstance(args[1], str):
+             var_data = [[float(b) for b in a.split(',')] for a in args[1::2]]
+             #print('var_data split', var_data)
+         else:
+             var_data = args[1::2]
+
+          # Check if first of each pair is a string
+         if all(isinstance(x, str) for x in var_names):
+
+              # Check if second of each pair are all equal in shape
+             if all(np.shape(var_data[1]) == np.shape(x) for x in var_data):
+
+                  # Create the netcdf with IMOS tag
+                 ds = Dataset(file_name, "w", format="NETCDF4")
+
+                  # Create time dimension with length to match data
+                 time_dim = ds.createDimension("TIME", len(var_data[0]))
+
+                  time_var = ds.createVariable("TIME", "f8", ("TIME"))
+
+                  time_var.setncattr('long_name', 'time')
+                 time_var.setncattr('standard_name', 'time')
+                 time_var.setncattr('units', 'days since 1950-01-01 00:00:00 UTC')
+                 time_var.setncattr('calendar', 'gregorian')
+                 time_var.setncattr('axis', 'T')
+                 time_var.setncattr('valid_max', 90000)
+                 time_var.setncattr('valid_min', 0)
+
+                  t0 = date2num(datetime(2020, 1, 1), units=time_var.units)
+                 ds.variables['TIME'][:] = np.arange(t0, t0 + (1 / 24) * len(var_data[1]), 1 / 24)
+
+                  # Create the nominal depth variable
+                 nom_depth_var = ds.createVariable("NOMINAL_DEPTH", "f8")
+                 nom_depth_var.setncattr('long_name', 'nominal depth')
+                 nom_depth_var.setncattr('units', 'dbar')
+                 nom_depth_var.setncattr('positive', 'down')
+                 nom_depth_var.setncattr('axis', 'Z')
+                 nom_depth_var.setncattr('valid_max', 12000)
+                 nom_depth_var.setncattr('valid_min', -5)
+                 nom_depth_var.setncattr('reference_datum', 'sea surface')
+
+                  ds.variables["NOMINAL_DEPTH"][:] = nominal_depth
+
+                  # Create variables from input data
+                 for name_in, data_in in zip(var_names, var_data):
+                     ds.createVariable(name_in, "f8", ("TIME"))
+                     ds.variables[name_in][:] = data_in
+
+                  ds.close()
+                 print("generated ", file_name)
+
+                  return (file_name)
+
+              else:
+                 print('Data arrays not of equal length')
+
+ 
+          else:
+             print('Labels not in string format')
+
+      else:
+         print('Data not passed in pairs')
+
+ 
+  if __name__ == "__main__":
+     netcdf_gen(sys.argv[1], sys.argv[2], *sys.argv[3:])
\ No newline at end of file

From cd6fc8ad08473ebb44a58db05a000f845222521f Mon Sep 17 00:00:00 2001
From: bweeding <57697604+bweeding@users.noreply.github.com>
Date: Thu, 20 Feb 2020 14:25:14 +1100
Subject: [PATCH 22/59] Update netcdf_gen.py

---
 ocean_dp/qc/netcdf_gen.py | 155 ++++++++++++++++++++------------------
 1 file changed, 81 insertions(+), 74 deletions(-)

diff --git a/ocean_dp/qc/netcdf_gen.py b/ocean_dp/qc/netcdf_gen.py
index d20f998..7eae159 100755
--- a/ocean_dp/qc/netcdf_gen.py
+++ b/ocean_dp/qc/netcdf_gen.py
@@ -13,101 +13,108 @@
  # You should have received a copy of the GNU General Public License
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
 
-  from netCDF4 import Dataset, date2num
- import sys
- from datetime import datetime
- import numpy as np
+from netCDF4 import Dataset, date2num
+import sys
+from datetime import datetime
+import numpy as np
 
-  # Provide the function with a filename (don't include .nc), a nominal depth,
- # and pairs of names and arrays containing the data to be included as variables.
- # A time dimension/variable is created by default, starting at 01/01/2020 using
- # 1 hour timestamps
+ # Provide the function with a filename (don't include .nc), a nominal depth,
+# and pairs of names and arrays containing the data to be included as variables.
+# A time dimension/variable is created by default, starting at 01/01/2020 using
+# 1 hour timestamps
 
-  # For example, netcdf_gen('test',30,'PRES',pres_data,'TEMP',temp_data)
- # from the command line gen_test_data.py test 30 PRES 10,20,30 TEMP 11,12,NaN
+ # For example, netcdf_gen('test',30,'PRES',pres_data,'TEMP',temp_data)
+# from the command line gen_test_data.py test 30 PRES 10,20,30 TEMP 11,12,NaN
 
-  def netcdf_gen(file_name, nominal_depth, *args):
-     # Convert the args tuple to a list
-     args = list(args)
-     #print(args, type(args[1]))
+def netcdf_gen(file_name, nominal_depth, *args):
+    # Convert the args tuple to a list
+    args = list(args)
+    #print(args, type(args[1]))
 
-      file_name = "IMOS_" + file_name + ".nc" # if we insist on not wanting to pass these
+    file_name = "IMOS_" + file_name + ".nc" # if we insist on not wanting to pass these
 
-      # deal with passing nominal depth as a string
-     if isinstance(nominal_depth, str):
-         nominal_depth = float(nominal_depth)
-         print('nominal depth :', nominal_depth)
+     # deal with passing nominal depth as a string
+    if isinstance(nominal_depth, str):
+        nominal_depth = float(nominal_depth)
+        print('nominal depth :', nominal_depth)
 
-      # Check the args are paired
-     if len(args) % 2 == 0:
+     # Check the args are paired
+    if len(args) % 2 == 0:
 
-          # Assign the names and data to lists
-         var_names = args[0::2]
+         # Assign the names and data to lists
+        var_names = args[0::2]
 
-          # deal with passing data as a string list of values
-         if isinstance(args[1], str):
-             var_data = [[float(b) for b in a.split(',')] for a in args[1::2]]
-             #print('var_data split', var_data)
-         else:
-             var_data = args[1::2]
+         # deal with passing data as a string list of values
+        if isinstance(args[1], str):
+            var_data = [[float(b) for b in a.split(',')] for a in args[1::2]]
+            #print('var_data split', var_data)
+        else:
+            var_data = args[1::2]
 
-          # Check if first of each pair is a string
-         if all(isinstance(x, str) for x in var_names):
+         # Check if first of each pair is a string
+        if all(isinstance(x, str) for x in var_names):
 
-              # Check if second of each pair are all equal in shape
-             if all(np.shape(var_data[1]) == np.shape(x) for x in var_data):
+             # Check if second of each pair are all equal in shape
+            if all(np.shape(var_data[0]) == np.shape(x) for x in var_data):
 
-                  # Create the netcdf with IMOS tag
-                 ds = Dataset(file_name, "w", format="NETCDF4")
+                 # Create the netcdf with IMOS tag
+                ds = Dataset(file_name, "w", format="NETCDF4")
 
-                  # Create time dimension with length to match data
-                 time_dim = ds.createDimension("TIME", len(var_data[0]))
+                 # Create time dimension with length to match data
+                time_dim = ds.createDimension("TIME", len(var_data[0]))
 
-                  time_var = ds.createVariable("TIME", "f8", ("TIME"))
+                time_var = ds.createVariable("TIME", "f8", ("TIME"))
 
-                  time_var.setncattr('long_name', 'time')
-                 time_var.setncattr('standard_name', 'time')
-                 time_var.setncattr('units', 'days since 1950-01-01 00:00:00 UTC')
-                 time_var.setncattr('calendar', 'gregorian')
-                 time_var.setncattr('axis', 'T')
-                 time_var.setncattr('valid_max', 90000)
-                 time_var.setncattr('valid_min', 0)
+                time_var.setncattr('long_name', 'time')
+                time_var.setncattr('standard_name', 'time')
+                time_var.setncattr('units', 'days since 1950-01-01 00:00:00 UTC')
+                time_var.setncattr('calendar', 'gregorian')
+                time_var.setncattr('axis', 'T')
+                time_var.setncattr('valid_max', 90000)
+                time_var.setncattr('valid_min', 0)
 
-                  t0 = date2num(datetime(2020, 1, 1), units=time_var.units)
-                 ds.variables['TIME'][:] = np.arange(t0, t0 + (1 / 24) * len(var_data[1]), 1 / 24)
+                t0 = date2num(datetime(2020, 1, 1), units=time_var.units)
+                ds.variables['TIME'][:] = np.arange(t0, t0 + (1 / 24) * len(var_data[1]), 1 / 24)
 
-                  # Create the nominal depth variable
-                 nom_depth_var = ds.createVariable("NOMINAL_DEPTH", "f8")
-                 nom_depth_var.setncattr('long_name', 'nominal depth')
-                 nom_depth_var.setncattr('units', 'dbar')
-                 nom_depth_var.setncattr('positive', 'down')
-                 nom_depth_var.setncattr('axis', 'Z')
-                 nom_depth_var.setncattr('valid_max', 12000)
-                 nom_depth_var.setncattr('valid_min', -5)
-                 nom_depth_var.setncattr('reference_datum', 'sea surface')
+                 # Create the nominal depth variable
+                nom_depth_var = ds.createVariable("NOMINAL_DEPTH", "f8")
+                nom_depth_var.setncattr('long_name', 'nominal depth')
+                nom_depth_var.setncattr('units', 'dbar')
+                nom_depth_var.setncattr('positive', 'down')
+                nom_depth_var.setncattr('axis', 'Z')
+                nom_depth_var.setncattr('valid_max', 12000)
+                nom_depth_var.setncattr('valid_min', -5)
+                nom_depth_var.setncattr('reference_datum', 'sea surface')
 
-                  ds.variables["NOMINAL_DEPTH"][:] = nominal_depth
+                ds.variables["NOMINAL_DEPTH"][:] = nominal_depth
 
-                  # Create variables from input data
-                 for name_in, data_in in zip(var_names, var_data):
-                     ds.createVariable(name_in, "f8", ("TIME"))
-                     ds.variables[name_in][:] = data_in
+                 # Create variables from input data
+                for name_in, data_in in zip(var_names, var_data):
+                    ds.createVariable(name_in, "f8", ("TIME"))
+                    ds.variables[name_in][:] = data_in
 
-                  ds.close()
-                 print("generated ", file_name)
+                ds.close()
+                print("generated ", file_name)
 
-                  return (file_name)
+                return (file_name)
 
-              else:
-                 print('Data arrays not of equal length')
+            else:
+                print('Data arrays not of equal length')
 
- 
-          else:
-             print('Labels not in string format')
 
-      else:
-         print('Data not passed in pairs')
+        else:
+            print('Labels not in string format')
 
- 
-  if __name__ == "__main__":
-     netcdf_gen(sys.argv[1], sys.argv[2], *sys.argv[3:])
\ No newline at end of file
+    else:
+        print('Data not passed in pairs')
+
+
+if __name__ == "__main__":
+    netcdf_gen(sys.argv[1], sys.argv[2], *sys.argv[3:])
+    
+    
+    
+    
+    
+    
+    
\ No newline at end of file

From b6d6168086b3a6d71c7c944d41f5c461d0af3d5a Mon Sep 17 00:00:00 2001
From: bweeding <57697604+bweeding@users.noreply.github.com>
Date: Thu, 20 Feb 2020 15:07:14 +1100
Subject: [PATCH 23/59] Update netcdf_gen.py

---
 ocean_dp/qc/netcdf_gen.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ocean_dp/qc/netcdf_gen.py b/ocean_dp/qc/netcdf_gen.py
index 7eae159..ef403cf 100755
--- a/ocean_dp/qc/netcdf_gen.py
+++ b/ocean_dp/qc/netcdf_gen.py
@@ -74,7 +74,7 @@ def netcdf_gen(file_name, nominal_depth, *args):
                 time_var.setncattr('valid_min', 0)
 
                 t0 = date2num(datetime(2020, 1, 1), units=time_var.units)
-                ds.variables['TIME'][:] = np.arange(t0, t0 + (1 / 24) * len(var_data[1]), 1 / 24)
+                ds.variables['TIME'][:] = np.linspace(t0,t0 + (1 / 24) * (len(var_data[0]-1)),num=len(var_data[0]))
 
                  # Create the nominal depth variable
                 nom_depth_var = ds.createVariable("NOMINAL_DEPTH", "f8")

From 2d5bc85a946355b8ee2e96bad1e5093acbac6470 Mon Sep 17 00:00:00 2001
From: bweeding <57697604+bweeding@users.noreply.github.com>
Date: Thu, 20 Feb 2020 16:19:53 +1100
Subject: [PATCH 24/59] adds qc variables

---
 ocean_dp/qc/add_qc_flags.py    |  5 +++-
 ocean_dp/qc/netcdf_gen.py      | 42 ++++++++++++++++++++++++++++++++++
 ocean_dp/qc/select_in_water.py |  2 ++
 3 files changed, 48 insertions(+), 1 deletion(-)

diff --git a/ocean_dp/qc/add_qc_flags.py b/ocean_dp/qc/add_qc_flags.py
index bd68234..1d1fa0e 100644
--- a/ocean_dp/qc/add_qc_flags.py
+++ b/ocean_dp/qc/add_qc_flags.py
@@ -101,4 +101,7 @@ def add_qc(netCDFfile):
 
 
 if __name__ == "__main__":
-    add_qc(sys.argv[1:])
\ No newline at end of file
+    add_qc(sys.argv[1:])
+    
+    
+    
\ No newline at end of file
diff --git a/ocean_dp/qc/netcdf_gen.py b/ocean_dp/qc/netcdf_gen.py
index ef403cf..32cb37c 100755
--- a/ocean_dp/qc/netcdf_gen.py
+++ b/ocean_dp/qc/netcdf_gen.py
@@ -17,6 +17,7 @@
 import sys
 from datetime import datetime
 import numpy as np
+import shutil
 
  # Provide the function with a filename (don't include .nc), a nominal depth,
 # and pairs of names and arrays containing the data to be included as variables.
@@ -26,6 +27,8 @@
  # For example, netcdf_gen('test',30,'PRES',pres_data,'TEMP',temp_data)
 # from the command line gen_test_data.py test 30 PRES 10,20,30 TEMP 11,12,NaN
 
+
+
 def netcdf_gen(file_name, nominal_depth, *args):
     # Convert the args tuple to a list
     args = list(args)
@@ -93,10 +96,49 @@ def netcdf_gen(file_name, nominal_depth, *args):
                     ds.createVariable(name_in, "f8", ("TIME"))
                     ds.variables[name_in][:] = data_in
 
+                        # read the variable names from the netCDF dataset
+                vars = ds.variables
+        
+                # create a list of variables, don't include the 'TIME' variable
+                # TODO: detect 'TIME' variable using the standard name 'time'
+                to_add = []
+                
+                for v in vars:
+                    #print (vars[v].dimensions)
+                    if v != 'TIME':
+                        to_add.append(v)
+        
+                # for each variable, add a new ancillary variable <VAR>_quality_control to each which has 'TIME' as a dimension
+                for v in to_add:
+                    if "TIME" in vars[v].dimensions:
+                        # print("time dim ", v)
+        
+                        if v+"_quality_control" not in ds.variables:
+                            ncVarOut = ds.createVariable(v+"_quality_control", "i1", vars[v].dimensions, fill_value=99, zlib=True)  # fill_value=99 otherwise defaults to max, imos-toolbox uses 99
+                            ncVarOut[:] = np.zeros(vars[v].shape)
+                            ncVarOut.long_name = "quality_code for " + v
+                            ncVarOut.flag_values = np.array([0, 1, 2, 3, 4, 6, 7, 9])
+                            ncVarOut.flag_meanings = 'unknown good_data probably_good_data probably_bad_data bad_data not_deployed interpolated missing_value'
+                            
+        
+                            vars[v].ancillary_variables = v + "_quality_control"
+
+                # update the global attributes
+                ds.file_version = "Level 1 - Quality Controlled Data"
+                
+                ds.history = datetime.utcnow().strftime("%Y%m%d:") + ' converted to FV01 file, quality_control variables added.'
+        
+                # ADD quality control attributes!!
+        
                 ds.close()
+                
+                #add_qc(file_name)
+                
                 print("generated ", file_name)
 
                 return (file_name)
+            
+                
 
             else:
                 print('Data arrays not of equal length')
diff --git a/ocean_dp/qc/select_in_water.py b/ocean_dp/qc/select_in_water.py
index 16cf613..8c296c1 100755
--- a/ocean_dp/qc/select_in_water.py
+++ b/ocean_dp/qc/select_in_water.py
@@ -29,6 +29,8 @@
 
 # Submit argument as a list
 
+
+
 def select_in_water(netCDFfiles):
     
     new_name = [] # list of new file names

From a5bdf2fef52227be1829dbd1e279e5847a6b8126 Mon Sep 17 00:00:00 2001
From: bweeding <57697604+bweeding@users.noreply.github.com>
Date: Fri, 21 Feb 2020 12:00:00 +1100
Subject: [PATCH 25/59] Update flatline_test.py

Removed the add_qc part; modified so it's possible to set files, target variables, window length, and the qc flag assigned. Tested with files made using netcdf_gen.
---
 ocean_dp/qc/flatline_test.py | 142 +++++++++++------------------------
 1 file changed, 44 insertions(+), 98 deletions(-)

diff --git a/ocean_dp/qc/flatline_test.py b/ocean_dp/qc/flatline_test.py
index a4e4ec5..074db32 100755
--- a/ocean_dp/qc/flatline_test.py
+++ b/ocean_dp/qc/flatline_test.py
@@ -1,28 +1,4 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-"""
-Created on Mon Feb  3 14:10:41 2020
-
-@author: tru050
-"""
-
-import re
-from datetime import datetime, timedelta
-from netCDF4 import num2date, date2num
-from netCDF4 import stringtochar
-import numpy.ma as ma
-import sys
-from netCDF4 import Dataset
-import numpy as np
-import argparse
-import glob
-import pytz
-import os
-
-#!/usr/bin/python3
-
-# add_qc_flags
-# Copyright (C) 2020 Peter Jansen
+# Copyright (C) 2020 Ben Weeding
 #
 # This program is free software: you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
@@ -37,111 +13,81 @@
 # You should have received a copy of the GNU General Public License
 # along with this program. If not, see <http://www.gnu.org/licenses/>.
 
-# add QC variables to file
-
-
-def add_qc(netCDFfile):
-
-    new_name = [] # list of new file names
-
-    # loop over all file names given
-    for fn in netCDFfile[1:]:
-        ds = Dataset(fn, 'a')
-
-        # read the variable names from the netCDF dataset
-        vars = ds.variables
-
-        # create a list of variables, don't include the 'TIME' variable
-        # TODO: detect 'TIME' variable using the standard name 'time'
-        to_add = []
-        for v in vars:
-            #print (vars[v].dimensions)
-            if v != 'TIME':
-                to_add.append(v)
-
-        # for each variable, add a new ancillary variable <VAR>_quality_control to each which has 'TIME' as a dimension
-        for v in to_add:
-            if "TIME" in vars[v].dimensions:
-                # print("time dim ", v)
-
-                ncVarOut = ds.createVariable(v+"_quality_control", "i1", vars[v].dimensions, fill_value=99, zlib=True)  # fill_value=99 otherwise defaults to max, imos-toolbox uses 99
-                ncVarOut[:] = np.zeros(vars[v].shape)
-                ncVarOut.long_name = "quality_code for " + v
-
-                vars[v].ancillary_variables = v + "_quality_control"
-
-        # update the file version attribute
-        ds.file_version = "Level 1 - Quality Controlled Data"
-
-        ds.close()
-
-        # rename the file FV00 to FV01 (imos specific)
-        fn_new = fn.replace("FV00", "FV01")
-        new_name.append(fn_new)
-
-        if fn_new != fn:
-            # copy file
-            os.copy(fn, fn_new)
-
-        print(fn_new)
-
-    return new_name
-
-
-if __name__ == "__main__":
-    add_qc(sys.argv)
-
-##############################################################################
+import re
+from datetime import datetime, timedelta
+from netCDF4 import num2date, date2num
+from netCDF4 import stringtochar
+import numpy.ma as ma
+import sys
+from netCDF4 import Dataset
+import numpy as np
+import argparse
+import glob
+import pytz
+import os
 
-def flatline_test(*target_files,target_vars=[],window=3):
+def flatline_test(target_files,target_vars_in=[],window=3,flag=3):
     
-    # If files aren't specified, take all the .nc files in the current folder
+    # If files aren't specified, take all the IMOS.nc files in the current folder
     if not target_files:
         
-        target_files = glob.glob('*.nc')
+        target_files = glob.glob('IMOS*.nc')
     
     # Loop through each files in target_files
     for current_file in target_files:
         
-        
         # Print each filename
         print("input file %s" % current_file)
         
         # Extract netcdf data into nc
-        nc = Dataset(current_file, mode="r")
+        nc = Dataset(current_file, mode="a")
         
         # Extract time
         ncTime = nc.get_variables_by_attributes(standard_name='time')
     
         # If target_vars aren't user specified, set it to all the variables of 
-        # the current_file, removing TIME
-        if target_vars == []:
+        # the current_file, removing unwanted variables
+        if target_vars_in == []:
             
             target_vars = list(nc.variables.keys())
             
+            # Remove TIME
             target_vars.remove('TIME')
             
-        # Check if file contains quality control variables, and if not create
-        
-        if not any("_quality_control" in i for i in target_vars:
-                   
-                   # insert _quality_control variables into file? 
-                   # should this be done now, or should we assume it
-                   # will have already been done?
-                
+            # Remove any quality_control variables
+            qc_vars = [s for s in target_vars if 'quality_control' in s]
+            
+            target_vars = [s for s in target_vars if s not in qc_vars]
+                            
+            # Remove any variables of single length
+            single_vars = [s for s in target_vars if nc.variables[s].size==1]
+            
+            target_vars = [s for s in target_vars if s not in single_vars]
+            
+            print('target_vars are '+' '.join(target_vars))
+            
+        else:
+            target_vars = target_vars_in
             
         # For each variable, extract the data 
         for current_var in target_vars:
             
             var_data = np.array(nc.variables[current_var])
             
-            for i in 0:(len(var_data)-window+1):
+            print('checking '+current_var)
+            
+            # Step through the data, one element at a time, using the window
+            for i in range(0,(len(var_data)-window+1)):
                 
-                    # This is true if 'window' elements in a row are equal
-                if len(set(var_data[i:(i+window)])) == 1
+                # This is true if 'window' elements in a row are equal
+                if len(set(var_data[i:(i+window)])) == 1:
                     
                     # set corresponding QC value to...
+                    nc.variables[current_var+'_quality_control'][i:(i+window)] = flag
+                   
+        nc.history += ' ' + datetime.utcnow().strftime("%Y%m%d:") + 'flatline_test performed, flatlines of '+str(window)+' consecutive values or more were flagged with '+str(flag)            
                     
+        nc.close()
     
             
             

From 05abdb91e2365967043dcba3517f6cfa63467d80 Mon Sep 17 00:00:00 2001
From: Peter Jansen <peter.jansen@csiro.au>
Date: Mon, 24 Feb 2020 10:07:35 +1100
Subject: [PATCH 26/59] Update flatline_test.py

separate global file name globbing
---
 ocean_dp/qc/flatline_test.py | 53 ++++++++++++++++++++----------------
 1 file changed, 30 insertions(+), 23 deletions(-)

diff --git a/ocean_dp/qc/flatline_test.py b/ocean_dp/qc/flatline_test.py
index 074db32..db8fec7 100755
--- a/ocean_dp/qc/flatline_test.py
+++ b/ocean_dp/qc/flatline_test.py
@@ -26,23 +26,31 @@
 import pytz
 import os
 
-def flatline_test(target_files,target_vars_in=[],window=3,flag=3):
-    
-    # If files aren't specified, take all the IMOS.nc files in the current folder
-    if not target_files:
-        
-        target_files = glob.glob('IMOS*.nc')
+
+# If files aren't specified, take all the IMOS*.nc files in the current folder
+def flatline_test_all_files(target_vars_in=[], window=3, flag=3):
+    target_files = glob.glob('IMOS*.nc')
+
+    flatline_test_files(target_files, target_vars_in=target_vars_in, window=window, flag=flag)
+
+
+def flatline_test_files(target_files, target_vars_in=[], window=3, flag=3):
     
     # Loop through each files in target_files
     for current_file in target_files:
-        
         # Print each filename
         print("input file %s" % current_file)
-        
+
         # Extract netcdf data into nc
         nc = Dataset(current_file, mode="a")
-        
-        # Extract time
+
+        # run the flat line test
+        flatline_test(nc=nc, target_vars_in=target_vars_in, window=window, flag=flag)
+
+
+def flatline_test(nc, target_vars_in=[], window=3, flag=3):
+
+        # Extract time, TODO: This is not used, should we set the window based on time, not samples?
         ncTime = nc.get_variables_by_attributes(standard_name='time')
     
         # If target_vars aren't user specified, set it to all the variables of 
@@ -56,12 +64,10 @@ def flatline_test(target_files,target_vars_in=[],window=3,flag=3):
             
             # Remove any quality_control variables
             qc_vars = [s for s in target_vars if 'quality_control' in s]
-            
             target_vars = [s for s in target_vars if s not in qc_vars]
                             
             # Remove any variables of single length
             single_vars = [s for s in target_vars if nc.variables[s].size==1]
-            
             target_vars = [s for s in target_vars if s not in single_vars]
             
             print('target_vars are '+' '.join(target_vars))
@@ -84,16 +90,17 @@ def flatline_test(target_files,target_vars_in=[],window=3,flag=3):
                     
                     # set corresponding QC value to...
                     nc.variables[current_var+'_quality_control'][i:(i+window)] = flag
-                   
-        nc.history += ' ' + datetime.utcnow().strftime("%Y%m%d:") + 'flatline_test performed, flatlines of '+str(window)+' consecutive values or more were flagged with '+str(flag)            
-                    
+
+        # update the history attribute
+        try:
+            hist = nc.history + "\n"
+        except AttributeError:
+            hist = ""
+
+        nc.setncattr('history', hist + 'flatline_test performed on [' + str(target_vars) + '], window '+str(window)+' consecutive values or more were flagged with '+str(flag) )
+
         nc.close()
     
-            
-            
-            
-            
-    
-    
-    
-    
\ No newline at end of file
+if __name__ == "__main__":
+    # usage is <file_name> <variable_name> <window> <flag value>
+    flatline_test(target_files=[sys.argv[1]], target_vars_in=[sys.argv[2]], window=float(sys.argv[3]), flag=float(sys.argv[4]))

From 42ef01b2480a95c3e96ffed5778c4a2e09d19313 Mon Sep 17 00:00:00 2001
From: bweeding <57697604+bweeding@users.noreply.github.com>
Date: Mon, 24 Feb 2020 12:16:56 +1100
Subject: [PATCH 27/59] test changes

First write of spike test; altered the `__name__ == "__main__"` block and the history attribute for the flatline test
---
 ocean_dp/qc/flatline_test.py |   7 +-
 ocean_dp/qc/spike_test       |   8 ---
 ocean_dp/qc/spike_test.py    | 120 +++++++++++++++++++++++++++++++++++
 3 files changed, 122 insertions(+), 13 deletions(-)
 delete mode 100755 ocean_dp/qc/spike_test
 create mode 100755 ocean_dp/qc/spike_test.py

diff --git a/ocean_dp/qc/flatline_test.py b/ocean_dp/qc/flatline_test.py
index db8fec7..e211cf4 100755
--- a/ocean_dp/qc/flatline_test.py
+++ b/ocean_dp/qc/flatline_test.py
@@ -49,9 +49,6 @@ def flatline_test_files(target_files, target_vars_in=[], window=3, flag=3):
 
 
 def flatline_test(nc, target_vars_in=[], window=3, flag=3):
-
-        # Extract time, TODO: This is not used, should we set the window based on time, not samples?
-        ncTime = nc.get_variables_by_attributes(standard_name='time')
     
         # If target_vars aren't user specified, set it to all the variables of 
         # the current_file, removing unwanted variables
@@ -97,10 +94,10 @@ def flatline_test(nc, target_vars_in=[], window=3, flag=3):
         except AttributeError:
             hist = ""
 
-        nc.setncattr('history', hist + 'flatline_test performed on [' + str(target_vars) + '], window '+str(window)+' consecutive values or more were flagged with '+str(flag) )
+        nc.setncattr('history', hist + datetime.utcnow().strftime("%Y-%m-%d") + 'flatline_test performed on [' + str(target_vars) + '], window '+str(window)+' consecutive values or more were flagged with '+str(flag) )
 
         nc.close()
     
 if __name__ == "__main__":
     # usage is <file_name> <variable_name> <window> <flag value>
-    flatline_test(target_files=[sys.argv[1]], target_vars_in=[sys.argv[2]], window=float(sys.argv[3]), flag=float(sys.argv[4]))
+    flatline_test_files(target_files=[sys.argv[1]], target_vars_in=[sys.argv[2]], window=float(sys.argv[3]), flag=float(sys.argv[4]))
diff --git a/ocean_dp/qc/spike_test b/ocean_dp/qc/spike_test
deleted file mode 100755
index 0bb13ea..0000000
--- a/ocean_dp/qc/spike_test
+++ /dev/null
@@ -1,8 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-"""
-Created on Wed Feb 12 16:28:38 2020
-
-@author: tru050
-"""
-
diff --git a/ocean_dp/qc/spike_test.py b/ocean_dp/qc/spike_test.py
new file mode 100755
index 0000000..c18c177
--- /dev/null
+++ b/ocean_dp/qc/spike_test.py
@@ -0,0 +1,120 @@
+# Copyright (C) 2020 Ben Weeding
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+import re
+from datetime import datetime, timedelta
+from netCDF4 import num2date, date2num
+from netCDF4 import stringtochar
+import numpy.ma as ma
+import sys
+from netCDF4 import Dataset
+import numpy as np
+import argparse
+import glob
+import pytz
+import os
+
+
+# If files aren't specified, take all the IMOS*.nc files in the current folder
+def spike_test_all_files(target_vars_in=[], thresh_low=2, thresh_high=4, flag_low=3, flag_high=4):
+    target_files = glob.glob('IMOS*.nc')
+
+    spike_test_files(target_files, target_vars_in=target_vars_in, thresh_low=thresh_low,thresh_high=thresh_high,flag_low=flag_low, flag_high=flag_high)
+
+
+def spike_test_files(target_files, target_vars_in=[], thresh_low=2, thresh_high=4, flag_low=3, flag_high=4):
+    
+    # Loop through each files in target_files
+    for current_file in target_files:
+        # Print each filename
+        print("input file %s" % current_file)
+
+        # Extract netcdf data into nc
+        nc = Dataset(current_file, mode="a")
+
+        # run the spike test
+        spike_test(nc=nc, target_vars_in=target_vars_in, thresh_low=thresh_low,thresh_high=thresh_high,flag_low=flag_low, flag_high=flag_high)
+
+
+def spike_test(nc, target_vars_in=[], thresh_low=2, thresh_high=4, flag_low=3, flag_high=4):
+    
+        # If target_vars aren't user specified, set it to all the variables of 
+        # the current_file, removing unwanted variables
+        if target_vars_in == []:
+            
+            target_vars = list(nc.variables.keys())
+            
+            # Remove TIME
+            target_vars.remove('TIME')
+            
+            # Remove any quality_control variables
+            qc_vars = [s for s in target_vars if 'quality_control' in s]
+            target_vars = [s for s in target_vars if s not in qc_vars]
+                            
+            # Remove any variables of single length
+            single_vars = [s for s in target_vars if nc.variables[s].size==1]
+            target_vars = [s for s in target_vars if s not in single_vars]
+            
+            print('target_vars are '+' '.join(target_vars))
+            
+        else:
+            target_vars = target_vars_in
+            
+        # For each variable, extract the data 
+        for current_var in target_vars:
+            
+            var_data = np.array(nc.variables[current_var])
+            
+            print('checking '+current_var)
+            
+            # Step through the data, one element at a time, starting from the 2nd element
+            for i in range(1,(len(var_data)-1)):
+                
+                # Calculate the mean of the i-1 and i+1 elements
+                shoulder_mean = np.mean(np.take(var_data,[i-1,i+1]))
+                
+                # Check for spike exceeding high threshold
+                if abs(var_data[i]-shoulder_mean) > thresh_high:
+                    
+                    #set corresponding QC value to...
+                    nc.variables[current_var+'_quality_control'][i] = flag_high
+
+                
+                # Check for spike exceeding low threshold
+                elif abs(var_data[i]-shoulder_mean) > thresh_low:
+                    
+                    # set corresponding QC value to...
+                    nc.variables[current_var+'_quality_control'][i] = flag_low
+
+        # update the history attribute
+        try:
+            hist = nc.history + "\n"
+        except AttributeError:
+            hist = ""
+
+        nc.setncattr('history', hist + datetime.utcnow().strftime("%Y-%m-%d") + ' :spike_test performed on [' + str(target_vars) + '], with spikes greater than '+str(thresh_high)+' flagged as '+str(flag_high)+' and spikes greater than '+str(thresh_low)+' flagged as '+str(flag_low))
+
+        nc.close()
+    
+if __name__ == "__main__":
+    # usage is <file_name> <variable_name> <window> <flag value>
+    spike_test_files(target_files=[sys.argv[1]], target_vars_in=[sys.argv[2]], thresh_low=float(sys.argv[3]), thresh_high=float(sys.argv[4]), flag_low= float(sys.argv[5]), flag_high= float(sys.argv[6]))
+
+    
+
+
+
+
+

From 925101f749a8700c993023a821088918a4522478 Mon Sep 17 00:00:00 2001
From: bweeding <57697604+bweeding@users.noreply.github.com>
Date: Mon, 2 Mar 2020 15:56:30 +1100
Subject: [PATCH 28/59] Update spike_test.py

---
 ocean_dp/qc/spike_test.py | 134 ++++++++++++++++++++++++++------------
 1 file changed, 92 insertions(+), 42 deletions(-)

diff --git a/ocean_dp/qc/spike_test.py b/ocean_dp/qc/spike_test.py
index c18c177..154f439 100755
--- a/ocean_dp/qc/spike_test.py
+++ b/ocean_dp/qc/spike_test.py
@@ -50,63 +50,113 @@ def spike_test_files(target_files, target_vars_in=[], thresh_low=2, thresh_high=
 
 def spike_test(nc, target_vars_in=[], thresh_low=2, thresh_high=4, flag_low=3, flag_high=4):
     
-        # If target_vars aren't user specified, set it to all the variables of 
-        # the current_file, removing unwanted variables
-        if target_vars_in == []:
+    # If target_vars aren't user specified, set it to all the variables of 
+    # the current_file, removing unwanted variables
+    if target_vars_in == []:
+        
+        target_vars = list(nc.variables.keys())
+        
+        # Remove TIME
+        target_vars.remove('TIME')
+        
+        # Remove any quality_control variables
+        qc_vars = [s for s in target_vars if 'quality_control' in s]
+        target_vars = [s for s in target_vars if s not in qc_vars]
+                        
+        # Remove any variables of single length
+        single_vars = [s for s in target_vars if nc.variables[s].size==1]
+        target_vars = [s for s in target_vars if s not in single_vars]
+        
+        print('target_vars are '+' '.join(target_vars))
+        
+    else:
+        target_vars = target_vars_in
+        
+    # For each variable, extract the data 
+    for current_var in target_vars:
+        
+        var_data = np.array(nc.variables[current_var])
+        
+        print('checking '+current_var+' for high spikes')
+        
+        # Step through the data, one element at a time, starting from the 2nd element
+        for i in range(1,(len(var_data)-1)):
             
-            target_vars = list(nc.variables.keys())
+            # Calculate the mean of the i-1 and i+1 elements
+            shoulder_mean = np.mean(np.take(var_data,[i-1,i+1]))
             
-            # Remove TIME
-            target_vars.remove('TIME')
+            # Check for spike exceeding high threshold
+            if abs(var_data[i]-shoulder_mean) > thresh_high:
+                
+                print('High spike found')
+                
+                #set corresponding QC value to...
+                nc.variables[current_var+'_quality_control'][i] = flag_high
+                
+        # # Extract the qc data         
+        # current_qc = np.array(nc.variables[current_var+'_quality_control'][:])      
+        
+        # # Find all the instances of consecutive 4s, and reset them to 0        
+        # for i in np.where(current_qc==4)[0][0:-1][np.diff(np.where(current_qc==4)[0])==1]:
+
+        #     nc.variables[current_var+'_quality_control'][i:i+2] = 0       
+        
+        # Find the indices where qc isn't set to 4 (high spike), removing the final element as it can't be check for a spike
+        low_spike_chk_idx = np.where(nc.variables[current_var+'_quality_control'][:]!=4)[0][0:-1]
+        
+        #print(low_spike_chk_idx)
+        
+        # Remove from the indices those that are either side of a high spike
+        for i in np.where(nc.variables[current_var+'_quality_control'][:]==4)[0]:
+            
+            low_spike_chk_idx=low_spike_chk_idx[low_spike_chk_idx!=[i-1]]
             
-            # Remove any quality_control variables
-            qc_vars = [s for s in target_vars if 'quality_control' in s]
-            target_vars = [s for s in target_vars if s not in qc_vars]
-                            
-            # Remove any variables of single length
-            single_vars = [s for s in target_vars if nc.variables[s].size==1]
-            target_vars = [s for s in target_vars if s not in single_vars]
+            low_spike_chk_idx=low_spike_chk_idx[low_spike_chk_idx!=[i+1]]
+        
+        #print(low_spike_chk_idx)
+        
+        print('checking '+current_var+' for low spikes')
+        
+        # For each of the remaining indices
+        for i in low_spike_chk_idx:
             
-            print('target_vars are '+' '.join(target_vars))
+            #print('i is '+str(i))
             
-        else:
-            target_vars = target_vars_in
+            # Calculate the mean of the i-1 and i+1 elements
+            shoulder_mean = np.mean(np.take(var_data,[i-1,i+1]))
             
-        # For each variable, extract the data 
-        for current_var in target_vars:
+            #print('shoulder mean is '+str(shoulder_mean))
             
-            var_data = np.array(nc.variables[current_var])
+            abs_diff = abs(var_data[i]-shoulder_mean)
             
-            print('checking '+current_var)
+            #print('absolute difference is '+str(abs_diff))
             
-            # Step through the data, one element at a time, starting from the 2nd element
-            for i in range(1,(len(var_data)-1)):
+            # Check for spike exceeding low threshold
+            if abs(var_data[i]-shoulder_mean) > thresh_low:
                 
-                # Calculate the mean of the i-1 and i+1 elements
-                shoulder_mean = np.mean(np.take(var_data,[i-1,i+1]))
+                print('Low spike found')
                 
-                # Check for spike exceeding high threshold
-                if abs(var_data[i]-shoulder_mean) > thresh_high:
-                    
-                    #set corresponding QC value to...
-                    nc.variables[current_var+'_quality_control'][i] = flag_high
+                #set corresponding QC value to...
+                nc.variables[current_var+'_quality_control'][i] = flag_low        
+
+        # # Extract the qc data         
+        # current_qc = np.array(nc.variables[current_var+'_quality_control'][:])      
+        
+        # # Find all the instances of consecutive 3s, and reset them to 0        
+        # for i in np.where(current_qc==3)[0][0:-1][np.diff(np.where(current_qc==3)[0])==1]:
+
+        #     nc.variables[current_var+'_quality_control'][i:i+2] = 0     
 
-                
-                # Check for spike exceeding low threshold
-                elif abs(var_data[i]-shoulder_mean) > thresh_low:
-                    
-                    # set corresponding QC value to...
-                    nc.variables[current_var+'_quality_control'][i] = flag_low
 
-        # update the history attribute
-        try:
-            hist = nc.history + "\n"
-        except AttributeError:
-            hist = ""
+    # update the history attribute
+    try:
+        hist = nc.history + "\n"
+    except AttributeError:
+        hist = ""
 
-        nc.setncattr('history', hist + datetime.utcnow().strftime("%Y-%m-%d") + ' :spike_test performed on [' + str(target_vars) + '], with spikes greater than '+str(thresh_high)+' flagged as '+str(flag_high)+' and spikes greater than '+str(thresh_low)+' flagged as '+str(flag_low))
+    nc.setncattr('history', hist + datetime.utcnow().strftime("%Y-%m-%d") + ' :spike_test performed on [' + str(target_vars) + '], with spikes greater than '+str(thresh_high)+' flagged as '+str(flag_high)+' and spikes greater than '+str(thresh_low)+' flagged as '+str(flag_low))
 
-        nc.close()
+    nc.close()
     
 if __name__ == "__main__":
     # usage is <file_name> <variable_name> <window> <flag value>

From 62b0419a1fcbb368a152728680bf3586c060d77c Mon Sep 17 00:00:00 2001
From: bweeding <57697604+bweeding@users.noreply.github.com>
Date: Tue, 3 Mar 2020 12:42:42 +1100
Subject: [PATCH 29/59] Update netcdf_gen.py

Brought the -1 outside the brackets; noticed that netcdf_gen was not producing hourly data when using it for the rate of change test.
---
 ocean_dp/qc/netcdf_gen.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/ocean_dp/qc/netcdf_gen.py b/ocean_dp/qc/netcdf_gen.py
index 32cb37c..1b56de8 100755
--- a/ocean_dp/qc/netcdf_gen.py
+++ b/ocean_dp/qc/netcdf_gen.py
@@ -77,7 +77,7 @@ def netcdf_gen(file_name, nominal_depth, *args):
                 time_var.setncattr('valid_min', 0)
 
                 t0 = date2num(datetime(2020, 1, 1), units=time_var.units)
-                ds.variables['TIME'][:] = np.linspace(t0,t0 + (1 / 24) * (len(var_data[0]-1)),num=len(var_data[0]))
+                ds.variables['TIME'][:] = np.linspace(t0,t0 + (1 / 24) * (len(var_data[0])-1),num=len(var_data[0]))
 
                  # Create the nominal depth variable
                 nom_depth_var = ds.createVariable("NOMINAL_DEPTH", "f8")
@@ -96,7 +96,8 @@ def netcdf_gen(file_name, nominal_depth, *args):
                     ds.createVariable(name_in, "f8", ("TIME"))
                     ds.variables[name_in][:] = data_in
 
-                        # read the variable names from the netCDF dataset
+
+                # read the variable names from the netCDF dataset
                 vars = ds.variables
         
                 # create a list of variables, don't include the 'TIME' variable

From cc0589762a76caea2a6334a918883ff77bfeb02b Mon Sep 17 00:00:00 2001
From: bweeding <57697604+bweeding@users.noreply.github.com>
Date: Wed, 4 Mar 2020 16:32:16 +1100
Subject: [PATCH 30/59] Create rate_of_change_test.py

Haven't yet worked out how to use sys.argv[] with both *args and a keyword argument - line 178
---
 ocean_dp/qc/rate_of_change_test.py | 184 +++++++++++++++++++++++++++++
 1 file changed, 184 insertions(+)
 create mode 100755 ocean_dp/qc/rate_of_change_test.py

diff --git a/ocean_dp/qc/rate_of_change_test.py b/ocean_dp/qc/rate_of_change_test.py
new file mode 100755
index 0000000..c3e1d9b
--- /dev/null
+++ b/ocean_dp/qc/rate_of_change_test.py
@@ -0,0 +1,184 @@
+# Copyright (C) 2020 Ben Weeding
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+import re
+from datetime import datetime, timedelta
+from netCDF4 import num2date, date2num
+from netCDF4 import stringtochar
+import numpy.ma as ma
+import sys
+from netCDF4 import Dataset
+import numpy as np
+import argparse
+import glob
+import pytz
+import os
+
+# how to specific rate of change? Will the function take rate of change as an 
+# argument?
+
+# Need a way of linking the rate of change to each different variable type
+
+# Could max rate of change be something we upload into the netcdf file atts? Or just in history?
+
+# Tell the function how much change you tolerate, and over what period of time - in sec?
+
+# Then convert to match the file timesteps
+
+# If files aren't specified, take all the IMOS*.nc files in the current folder
+def roc_test_all_files(*args,target_vars_in=[]):
+    target_files = glob.glob('IMOS*.nc')
+
+    roc_test_files(target_files, target_vars_in=target_vars_in,*args)
+
+def roc_test_files(target_files,*args,target_vars_in=[]):
+    
+    # Loop through each files in target_files
+    for current_file in target_files:
+        # Print each filename
+        print("input file %s" % current_file)
+        
+        print(args)
+
+        # Extract netcdf data into nc
+        nc = Dataset(current_file, mode="a")
+
+        # run the spike test - specifying *args here makes python unpack args to be passed again successfully as separate items
+        roc_test(nc,*args, target_vars_in=target_vars_in)
+
+
+# Enter args as variable name and rate of change limit, ie. 'TEMP',4
+def roc_test(nc,*args,target_vars_in=[]):
+    
+    # Check the time format
+    if nc.variables['TIME'].getncattr('units') =='days since 1950-01-01 00:00:00 UTC':
+        
+        # Convert the args tuple to a list
+        args = list(args)
+        
+        # If a single rate of change limit is supplied
+        if len(args) == 1:
+            
+            change_per_hr = args[0]
+        
+            print('One rate of change limit will be applied to all variables')
+            
+            # If target_vars aren't user specified, set it to all the variables of 
+            # the current_file, removing unwanted variables
+            if target_vars_in == []:
+                
+                target_vars = list(nc.variables.keys())
+                
+                # Remove TIME
+                target_vars.remove('TIME')
+                
+                # Remove any quality_control variables
+                qc_vars = [s for s in target_vars if 'quality_control' in s]
+                target_vars = [s for s in target_vars if s not in qc_vars]
+                                
+                # Remove any variables of single length
+                single_vars = [s for s in target_vars if nc.variables[s].size==1]
+                target_vars = [s for s in target_vars if s not in single_vars]
+                
+                print('target_vars are '+' '.join(target_vars))
+                
+            else:
+                target_vars = target_vars_in
+            
+            # Extract the time data
+            nc_time = np.array(nc.variables['TIME'][:])
+            
+            # Convert from days to hours
+            nc_time_hr = nc_time*24
+            
+            # For each variable
+            for current_var in target_vars:
+                
+                # Extract the data
+                var_data = np.array(nc.variables[current_var])
+                
+                # Calculate dvar/dtime
+                var_roc = np.divide(np.diff(var_data),np.diff(nc_time_hr))
+                
+                # For any change greater than change_per_hr, assign a qc value of 4
+                nc.variables[current_var+'_quality_control'][[x for x in abs(np.insert(var_roc,0,0)) > change_per_hr]] = 4
+                    
+            # update the history attribute
+            try:
+                hist = nc.history + "\n"
+            except AttributeError:
+                hist = ""
+    
+            nc.setncattr('history', hist + datetime.utcnow().strftime("%Y-%m-%d") + 'rate of change test performed, with all changes above '+str(change_per_hr)+' flagged as 4')        
+            
+
+        # If multiple rate of change limits are supplied, with variable names
+        elif len(args) % 2 == 0 and all(isinstance(x,str) for x in args[0::2]) and all(isinstance(y,(float,int)) for y in args[1::2]):
+            
+            # Take target variables from args
+            target_vars = args[0::2]
+            
+            print('target_vars are '+' '.join(target_vars))
+        
+            # Convert arguments to dict
+            rate_spec = dict(zip(args[0::2],args[1::2]))
+            
+            # Extract the time data
+            nc_time = np.array(nc.variables['TIME'][:])
+            
+            # Convert from days to hours
+            nc_time_hr = nc_time*24
+            
+            # For each variable
+            for current_var in target_vars:
+                
+                # Extract the data
+                var_data = np.array(nc.variables[current_var])
+                
+                # Calculate dvar/dtime
+                var_roc = np.divide(np.diff(var_data),np.diff(nc_time_hr))
+                
+                # For any change greater than change_per_hr, assign a qc value of 4
+                nc.variables[current_var+'_quality_control'][[x for x in abs(np.insert(var_roc,0,0)) > rate_spec[current_var]]] = 4
+                    
+            # update the history attribute
+            try:
+                hist = nc.history + "\n"
+            except AttributeError:
+                hist = ""
+    
+            nc.setncattr('history', hist + datetime.utcnow().strftime("%Y-%m-%d") + ': rate of change test performed, with all changes above those specified in the following list flagged as 4: '+str(args))        
+            
+            
+        else:
+            print('Arguments passed do not match the required format. No roc test performed.')
+        
+        
+    # If the time format doesn't match IMOS requirements    
+    else:
+        print('Time format does not match the required IMOS form of: days since 1950-01-01 00:00:00 UTC')
+    
+    
+    nc.close()
+    
+    
+# Not sure how to sys.argv[] with both *args and a keyword argument
+if __name__ == "__main__":
+    # usage is <file_name> <variable_name> <*args>
+    roc_test_files(target_files=[sys.argv[1]], target_vars_in=[sys.argv[2]], *sys.argv[3:])
+
+    
+
+

From 24e5efd24cb3c87d7d5273b437ab6ea7e8a1771b Mon Sep 17 00:00:00 2001
From: bweeding <57697604+bweeding@users.noreply.github.com>
Date: Wed, 11 Mar 2020 19:56:10 +1100
Subject: [PATCH 31/59] Create temp_diff_histograms.py

---
 ocean_dp/qc/temp_diff_histograms.py | 109 ++++++++++++++++++++++++++++
 1 file changed, 109 insertions(+)
 create mode 100755 ocean_dp/qc/temp_diff_histograms.py

diff --git a/ocean_dp/qc/temp_diff_histograms.py b/ocean_dp/qc/temp_diff_histograms.py
new file mode 100755
index 0000000..2a78a92
--- /dev/null
+++ b/ocean_dp/qc/temp_diff_histograms.py
@@ -0,0 +1,109 @@
+# Copyright (C) 2020 Ben Weeding
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+import numpy.ma as ma
+import sys
+from netCDF4 import Dataset
+import numpy as np
+import argparse
+import glob
+import pytz
+import os
+import matplotlib.pyplot as plt
+from matplotlib import colors
+from matplotlib.ticker import PercentFormatter
+
+netcdf_files = []
+
+temp_diffs = np.array([])
+
+for root, dirs, files in os.walk("/Users/tru050/Desktop/cloudstor/Shared/SOTS-Temp-Raw-Data"):
+    for file,dirx in files,dirs:
+        if file.endswith('.nc'):
+            netcdf_files.append(file)
+            nc = Dataset(os.path.join(dirs, file),mode='r')
+            
+            temp_diffs = np.concatenate((temp_diffs,np.diff(np.array(nc.variables['TEMP'][:]))))
+            
+            nc.close()
+            
+
+print (list_of_files)
+    
+    
+    
+
+
+files = glob.glob('*.nc')
+
+temp_diffs = np.array([])
+
+for current_file in files:
+    
+    nc = Dataset(current_file,mode='r')
+    
+    temp_diffs = np.concatenate((temp_diffs,np.diff(np.array(nc.variables['TEMP'][:]))))
+
+fig, ax = plt.subplots()
+
+ax.hist(temp_diffs,100,log=True)
+
+
+# use os.walk??? to run in each netcdf folder?? os.scandir()?
+
+
+
+
+
+# sofs75_60m = Dataset('IMOS_ABOS-SOTS_T_20180801_SOFS_FV00_SOFS-7.5-2018-Starmon-mini-4051-60m_END-20190331_C-20200204.nc',mode='r')
+# sofs75_70m = Dataset('IMOS_ABOS-SOTS_T_20180801_SOFS_FV00_SOFS-7.5-2018-Starmon-mini-4052-70m_END-20190331_C-20200204.nc',mode='r')
+# sofs75_75m = Dataset('IMOS_ABOS-SOTS_T_20180801_SOFS_FV00_SOFS-7.5-2018-Starmon-mini-4053-75m_END-20190331_C-20200204.nc',mode='r')
+
+
+# temp_60 = np.array(sofs75_60m.variables['TEMP'][:])
+# temp_70 = np.array(sofs75_70m.variables['TEMP'][:])
+# temp_75 = np.array(sofs75_75m.variables['TEMP'][:])
+
+# label_coords = (0.01, 0.85)
+# label_method = 'axes fraction'
+
+# fig, axs = plt.subplots(3, 1, sharey=True)
+# axs[0].set_title('Temp sensor comparison SOFS7.5')
+
+# axs[0].hist(np.diff(temp_60),bins=100,log=True, histtype='bar', stacked=True)
+# axs[0].set_ylim(bottom=0.1,top=10E5)
+# axs[0].set_xlim(left=-40, right=40)
+# axs[0].annotate('60m',xy=label_coords, xycoords=label_method)
+# axs[0].tick_params(labelbottom=False)
+
+# axs[1].hist(np.diff(temp_70),bins=100,log=True)
+# axs[1].set_ylim(bottom=0.1,top=10E5)
+# axs[1].set_xlim(left=-40, right=40)
+# axs[1].annotate('70m',xy=label_coords, xycoords=label_method)
+# axs[1].tick_params(labelbottom=False)
+
+# axs[2].hist(np.diff(temp_75),bins=100,log=True)
+# axs[2].set_ylim(bottom=0.1,top=10E5)
+# axs[2].set_xlim(left=-40, right=40)
+# axs[2].annotate('75m',xy=label_coords, xycoords=label_method)
+
+
+
+# fig.savefig('test.pdf')
+
+
+
+
+
+

From fd81c7504b9af279448d544fb4b1da1229904d2c Mon Sep 17 00:00:00 2001
From: bweeding <57697604+bweeding@users.noreply.github.com>
Date: Mon, 16 Mar 2020 18:05:27 +1100
Subject: [PATCH 32/59] plots for qc testing

---
 ocean_dp/qc/agg_temp_plot.py        | 104 ++++++++++++
 ocean_dp/qc/in_out_water.py         |  10 ++
 ocean_dp/qc/temp_diff_hist_extra.py |  67 ++++++++
 ocean_dp/qc/temp_diff_hist_glob     | 121 ++++++++++++++
 ocean_dp/qc/temp_diff_hist_glob.py  | 119 ++++++++++++++
 ocean_dp/qc/temp_diff_histograms.py | 244 +++++++++++++++++++++++++++-
 ocean_dp/qc/temp_time_diff_plots.py | 117 +++++++++++++
 7 files changed, 773 insertions(+), 9 deletions(-)
 create mode 100755 ocean_dp/qc/agg_temp_plot.py
 create mode 100755 ocean_dp/qc/temp_diff_hist_extra.py
 create mode 100755 ocean_dp/qc/temp_diff_hist_glob
 create mode 100755 ocean_dp/qc/temp_diff_hist_glob.py
 create mode 100755 ocean_dp/qc/temp_time_diff_plots.py

diff --git a/ocean_dp/qc/agg_temp_plot.py b/ocean_dp/qc/agg_temp_plot.py
new file mode 100755
index 0000000..cb92107
--- /dev/null
+++ b/ocean_dp/qc/agg_temp_plot.py
@@ -0,0 +1,104 @@
+# Copyright (C) 2020 Ben Weeding
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+import numpy.ma as ma
+import sys
+from netCDF4 import Dataset, num2date
+from dateutil import parser
+import numpy as np
+import argparse
+import glob
+import pytz
+import os
+import matplotlib.pyplot as plt
+from matplotlib import colors
+from matplotlib.ticker import PercentFormatter
+from sigfig import round
+
+
+x=Dataset('IMOS_ABOS-SOTS_COPSTIP_20180822_SOFS_FV02_SOFS-Aggregate-TEMP_END-20190322_C-20200311.nc',mode='r')
+
+temp = np.array(x.variables['TEMP'][:])
+
+time = np.array(x.variables['TIME'][:])
+
+ins_idx = np.array(x.variables['instrument_index'][:])
+
+fig, ax = plt.subplots(6,5)
+
+ax=ax.flatten()
+
+label_coords = (0.1, 0.8)
+label_method = 'axes fraction' 
+
+for i in set(np.array(ins_idx)):
+
+    ax[i].plot(time[ins_idx==i],temp[ins_idx==i])
+    
+    ax[i].annotate('S:'+str(i),xy=label_coords, xycoords=label_method,fontsize=8)
+    
+i=1
+fig, ax = plt.subplots()
+ax.plot(time[ins_idx==i],temp[ins_idx==i])
+
+
+# Remove bad instruments
+good_vals = [a!=14 and a!=15 for a in ins_idx]
+
+fig, ax = plt.subplots()
+ax.hist(temp[good_vals],21)
+
+sofs75_temp_diffs = np.array([])
+
+good_ins = set(np.array(ins_idx))
+
+good_ins -= {14,15}
+
+for i in good_ins:
+    
+    cur_temp = temp[ins_idx==i]
+    
+    cur_time = time[ins_idx==i]
+    
+    cur_time_hr = cur_time*24
+    
+    # Calculate time changes
+    cur_time_hr_diffs = np.diff(cur_time_hr)
+    
+    cur_temp_diffs = np.diff(cur_temp)
+    
+    # Calculate the rate of change of temperature wrt time
+    cur_dtemp_dtime = np.divide(cur_temp_diffs,cur_time_hr_diffs)
+    
+    print('ins '+str(i)+':'+str(np.max(cur_dtemp_dtime)))
+    
+    sofs75_temp_diffs = np.concatenate((sofs75_temp_diffs,cur_dtemp_dtime))
+    
+    
+    
+    
+    
+    
+    
+    
+
+
+
+
+
+
+
+
+
diff --git a/ocean_dp/qc/in_out_water.py b/ocean_dp/qc/in_out_water.py
index ac14840..2543c23 100644
--- a/ocean_dp/qc/in_out_water.py
+++ b/ocean_dp/qc/in_out_water.py
@@ -60,6 +60,16 @@ def in_out_water(netCDFfile):
 
 
     ds.file_version = "Level 1 - Quality Controlled Data"
+    
+    # update the history attribute
+    try:
+        hist = nc.history + "\n"
+        
+    except AttributeError:
+        hist = ""
+    
+    nc.setncattr('history', hist + datetime.utcnow().strftime("%Y-%m-%d") + ': in water test performed, with out of water data flagged at QC=7')        
+            
 
     ds.close()
 
diff --git a/ocean_dp/qc/temp_diff_hist_extra.py b/ocean_dp/qc/temp_diff_hist_extra.py
new file mode 100755
index 0000000..d8b6019
--- /dev/null
+++ b/ocean_dp/qc/temp_diff_hist_extra.py
@@ -0,0 +1,67 @@
+# Copyright (C) 2020 Ben Weeding
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+for root, dirs, files in os.walk("/Users/tru050/Desktop/cloudstor/Shared/SOTS-Temp-Raw-Data"):
+    
+    for fname in files:
+      
+        if fname.endswith('.nc') and 'FV01' in fname:
+        
+            print(fname)  #Here, the wanted file name is printed
+
+            ds = Dataset(os.path.join(root,fname), 'a')
+        
+            vars = ds.variables
+        
+            to_add = []
+            for v in vars:
+                #print (vars[v].dimensions)
+                if v != 'TIME':
+                    to_add.append(v)
+        
+            time_var = vars["TIME"]
+            time = num2date(time_var[:], units=time_var.units, calendar=time_var.calendar)
+        
+            time_deploy = parser.parse(ds.time_deployment_start, ignoretz=True)
+            time_recovery = parser.parse(ds.time_deployment_end, ignoretz=True)
+        
+            print(time_deploy)
+        
+            print(to_add)
+            for v in to_add:
+                if "TIME" in vars[v].dimensions:
+        
+                    if v.endswith("_quality_control"):
+        
+                        print("QC time dim ", v)
+        
+                        ncVarOut = vars[v]
+                        mask = (time <= time_deploy) | (time >= time_recovery)
+                        ncVarOut[mask] = np.ones(vars[v].shape)[mask] * 7
+        
+        
+            ds.file_version = "Level 1 - Quality Controlled Data"
+            
+            # update the history attribute
+            try:
+                hist = ds.history + "\n"
+                
+            except AttributeError:
+                hist = ""
+            
+            ds.setncattr('history', hist + datetime.utcnow().strftime("%Y-%m-%d") + ': in water test performed, with out of water data flagged at QC=7')        
+                    
+
+            ds.close()
\ No newline at end of file
diff --git a/ocean_dp/qc/temp_diff_hist_glob b/ocean_dp/qc/temp_diff_hist_glob
new file mode 100755
index 0000000..a87ca72
--- /dev/null
+++ b/ocean_dp/qc/temp_diff_hist_glob
@@ -0,0 +1,121 @@
+# Copyright (C) 2020 Ben Weeding
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+import numpy.ma as ma
+import sys
+from netCDF4 import Dataset
+import numpy as np
+import argparse
+import glob
+import pytz
+import os
+import matplotlib.pyplot as plt
+from matplotlib import colors
+from matplotlib.ticker import PercentFormatter
+import glob
+
+deployments = []
+
+for x in os.listdir("/Users/tru050/Desktop/cloudstor/Shared/SOTS-Temp-Raw-Data"):
+    
+    if ('Pulse' in x) or ('SOFS' in x):
+        
+        deployments.append(x)
+        
+
+fv01_files = glob.glob("/Users/tru050/Desktop/cloudstor/Shared/SOTS-Temp-Raw-Data/*/*/*FV01*.nc")
+
+
+fig, ax = plt.subplots(4,4,sharex='all', sharey='all')
+
+ax=ax.flatten()
+
+
+for current_deployment, plt_idx in zip(deployments, range(0,16)):
+    
+    
+    
+    for fname in files:
+      
+        if fname.find(current_deployment) and fname.endswith('.nc') and 'FV01' in fname:
+        
+            print(fname)  #Here, the wanted file name is printed
+
+            nc = Dataset(os.path.join(root,fname), mode = 'r')
+            
+            if 'TEMP_quality_control' in list(nc.variables) and np.array(nc.variables['TEMP'][:]).ndim == 1 and nc.variables['TIME'].getncattr('units') =='days since 1950-01-01 00:00:00 UTC':
+                
+                # Calculate temperature changes
+                nc_temp_diffs = np.diff(np.array(nc.variables['TEMP'][np.array(nc.variables['TEMP_quality_control'][:])!=7]))
+                
+                # Extract the time data
+                nc_time = np.array(nc.variables['TIME'][np.array(nc.variables['TEMP_quality_control'][:])!=7])
+            
+                # Convert from days to hours
+                nc_time_hr = nc_time*24
+                
+                # Calculate time changes
+                nc_time_hr_diffs = np.diff(nc_time_hr)
+                
+                # Calculate the rate of change of temperature wrt time
+                nc_dtemp_dtime = np.divide(nc_temp_diffs,nc_time_hr_diffs)
+                
+                # Add the results for this netcdf to the record for all files
+                #all_dtemp_dtime = np.concatenate((all_dtemp_dtime,nc_dtemp_dtime))
+                
+                #all_dtemp_dtime_deps += ([nc.deployment_code] * len(nc_dtemp_dtime))
+                
+                #netcdffiles.append(fname)
+                
+                #mins.append(np.amin(nc_dtemp_dtime))
+                
+                #maxs.append(np.amax(nc_dtemp_dtime))
+                
+                
+            
+            nc.close()
+            
+    ax[plt_idx].hist(nc_dtemp_dtime,100,log=True)
+    
+    ax[plt_idx].set_ylim(bottom=0.1,top=10E5)
+
+    ax[plt_idx].set_xlim(left=-500, right=500)
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/ocean_dp/qc/temp_diff_hist_glob.py b/ocean_dp/qc/temp_diff_hist_glob.py
new file mode 100755
index 0000000..1e63d14
--- /dev/null
+++ b/ocean_dp/qc/temp_diff_hist_glob.py
@@ -0,0 +1,119 @@
+# Copyright (C) 2020 Ben Weeding
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+import numpy.ma as ma
+import sys
+from netCDF4 import Dataset
+import numpy as np
+import argparse
+import glob
+import pytz
+import os
+import matplotlib.pyplot as plt
+from matplotlib import colors
+from matplotlib.ticker import PercentFormatter
+import glob
+
+deployments = []
+
+for x in os.listdir("/Users/tru050/Desktop/cloudstor/Shared/SOTS-Temp-Raw-Data"):
+    
+    if ('Pulse' in x) or ('SOFS' in x):
+        
+        deployments.append(x)
+        
+
+deployment_dtemp_dtime = np.array([])
+
+fv01_files = glob.glob("/Users/tru050/Desktop/cloudstor/Shared/SOTS-Temp-Raw-Data/*/*/*FV01*.nc")
+
+
+fig, ax = plt.subplots(4,4,sharex='all', sharey='all')
+
+ax=ax.flatten()
+
+
+for current_deployment, plt_idx in zip(deployments, range(0,len(deployments))):
+    
+    print(current_deployment + 'files')
+    
+    for fname in fv01_files:
+      
+        if current_deployment in fname:
+        
+            #print(fname + ' contains ' + current_deployment)  #Here, the wanted file name is printed
+
+            nc = Dataset(fname, mode = 'r')
+            
+            if 'TEMP_quality_control' in list(nc.variables) and np.array(nc.variables['TEMP'][:]).ndim == 1:
+                
+                print(fname)
+                
+                # Calculate temperature changes
+                nc_temp_diffs = np.diff(np.array(nc.variables['TEMP'][np.array(nc.variables['TEMP_quality_control'][:])!=7]))
+                
+                # Extract the time data
+                nc_time = np.array(nc.variables['TIME'][np.array(nc.variables['TEMP_quality_control'][:])!=7])
+            
+                # Convert from days to hours
+                nc_time_hr = nc_time*24
+                
+                # Calculate time changes
+                nc_time_hr_diffs = np.diff(nc_time_hr)
+                
+                # Calculate the rate of change of temperature wrt time
+                nc_dtemp_dtime = np.divide(nc_temp_diffs,nc_time_hr_diffs)
+                
+                # Add the results for this netcdf to the record for the deployment
+                deployment_dtemp_dtime = np.concatenate((deployment_dtemp_dtime,nc_dtemp_dtime))
+                
+            nc.close()
+            
+    print('plotting '+ str(len(deployment_dtemp_dtime)) + ' values')
+            
+    ax[plt_idx].hist(deployment_dtemp_dtime,100)
+    
+    #ax[plt_idx].set_ylim(bottom=0.1,top=10E5)
+
+    #ax[plt_idx].set_xlim(left=-10, right=10)
+    
+    deployment_dtemp_dtime = np.array([])
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/ocean_dp/qc/temp_diff_histograms.py b/ocean_dp/qc/temp_diff_histograms.py
index 2a78a92..2003afc 100755
--- a/ocean_dp/qc/temp_diff_histograms.py
+++ b/ocean_dp/qc/temp_diff_histograms.py
@@ -14,7 +14,8 @@
 # along with this program. If not, see <http://www.gnu.org/licenses/>.
 import numpy.ma as ma
 import sys
-from netCDF4 import Dataset
+from netCDF4 import Dataset, num2date
+from dateutil import parser
 import numpy as np
 import argparse
 import glob
@@ -23,26 +24,249 @@
 import matplotlib.pyplot as plt
 from matplotlib import colors
 from matplotlib.ticker import PercentFormatter
+from sigfig import round
 
-netcdf_files = []
+deployments = []
 
-temp_diffs = np.array([])
+for x in os.listdir("/Users/tru050/Desktop/cloudstor/Shared/SOTS-Temp-Raw-Data"):
+    
+    if ('Pulse' in x) or ('SOFS' in x):
+        
+        deployments.append(x)
+
+
+
+# check for in water test in history of netcdf file, if not perform the test
+
+netcdffiles = []
+
+mins=[]
+
+maxs=[]
+
+all_dtemp_dtime = np.array([])
+
+all_dtemp_dtime_deps = []
 
 for root, dirs, files in os.walk("/Users/tru050/Desktop/cloudstor/Shared/SOTS-Temp-Raw-Data"):
-    for file,dirx in files,dirs:
-        if file.endswith('.nc'):
-            netcdf_files.append(file)
-            nc = Dataset(os.path.join(dirs, file),mode='r')
+    
+    for fname in files:
+      
+        if fname.endswith('.nc') and 'FV01' in fname:
+        
+            print(fname)  #Here, the wanted file name is printed
+
+      
+            nc = Dataset(os.path.join(root,fname), mode = 'r')
+            
+            if 'TEMP_quality_control' in list(nc.variables) and np.array(nc.variables['TEMP'][:]).ndim == 1 and nc.variables['TIME'].getncattr('units') =='days since 1950-01-01 00:00:00 UTC':
+                
+                # Calculate temperature changes
+                nc_temp_diffs = np.diff(np.array(nc.variables['TEMP'][np.array(nc.variables['TEMP_quality_control'][:])!=7]))
+                
+                # Extract the time data
+                nc_time = np.array(nc.variables['TIME'][np.array(nc.variables['TEMP_quality_control'][:])!=7])
             
-            temp_diffs = np.concatenate((temp_diffs,np.diff(np.array(nc.variables['TEMP'][:]))))
+                # Convert from days to hours
+                nc_time_hr = nc_time*24
+                
+                # Calculate time changes
+                nc_time_hr_diffs = np.diff(nc_time_hr)
+                
+                # Calculate the rate of change of temperature wrt time
+                nc_dtemp_dtime = np.divide(nc_temp_diffs,nc_time_hr_diffs)
+                
+                # Add the results for this netcdf to the record for all files
+                all_dtemp_dtime = np.concatenate((all_dtemp_dtime,nc_dtemp_dtime))
+                
+                all_dtemp_dtime_deps += ([nc.deployment_code] * len(nc_dtemp_dtime))
+                
+                netcdffiles.append(fname)
+                
+                mins.append(np.amin(nc_dtemp_dtime))
+                
+                maxs.append(np.amax(nc_dtemp_dtime))
             
             nc.close()
+
+
+fig, ax = plt.subplots()
+
+bins = np.linspace(-450,450,901)
+
+line_thick = 0.5
+
+counts,bins,bars = ax.hist(all_dtemp_dtime,bins,log=True)                   
+
+ax.axvline(x=3*np.std(all_dtemp_dtime),color='r',linewidth=line_thick) 
+
+ax.axvline(x=-3*np.std(all_dtemp_dtime),color='r',linewidth=line_thick) 
+
+ax.set_title('Hourly temp changes from all FV01 files in SOTS-TEMP-Raw_Data')    
+
+label_coords = (0.01, 0.9)
+
+label_method = 'axes fraction'
+
+ax.annotate('~1.84E7 measurements',xy=label_coords, xycoords=label_method)
+
+
+
+def last_four(entry):
+    
+    output = entry[-4::]
+    
+    return output
+
+
+deployments = []
+
+for x in os.listdir("/Users/tru050/Desktop/cloudstor/Shared/SOTS-Temp-Raw-Data"):
+    
+    if ('Pulse' in x) or ('SOFS' in x):
+        
+        deployments.append(x)
+
+deployments.sort(key=last_four)
+
+
+
+
+
+
+
+all_deployment_dtemp_dtime = [None] * len(deployments)
+
+for current_deployment, plt_idx in zip(deployments, range(0,len(deployments))):
+    
+    print('current deployment is '+current_deployment)
+    
+    deployment_dtemp_dtime = np.array([])
+    
+    for root, dirs, files in os.walk("/Users/tru050/Desktop/cloudstor/Shared/SOTS-Temp-Raw-Data"):
+    
+        for fname in files:
+          
+            if current_deployment in fname and fname.endswith('.nc') and 'FV00' in fname:
+            
+                #print(fname)  #Here, the wanted file name is printed
+
+                nc = Dataset(os.path.join(root,fname), mode = 'r')
+                
+                if 'TEMP' in nc.variables and np.array(nc.variables['TEMP'][:]).ndim == 1 and nc.variables['TIME'].getncattr('units') =='days since 1950-01-01 00:00:00 UTC':
+                    
+                    time_var = nc.variables["TIME"]
+                    
+                    time = num2date(time_var[:], units=time_var.units, calendar=time_var.calendar)
+                
+                    time_deploy = parser.parse(nc.time_deployment_start, ignoretz=True)
+                    
+                    time_recovery = parser.parse(nc.time_deployment_end, ignoretz=True)
+                    
+                    #print('using '+fname)
+                    
+                    temp_extract = np.array(nc.variables['TEMP'][:][(time >= time_deploy) | (time <= time_recovery)])
+                    
+                    # Calculate temperature changes
+                    nc_temp_diffs = np.diff(temp_extract)
+                    
+                    # Extract the time data
+                    nc_time = np.array(nc.variables['TIME'][:][(time >= time_deploy) | (time <= time_recovery)])
+                
+                    # Convert from days to hours
+                    nc_time_hr = nc_time*24
+                    
+                    # Calculate time changes
+                    nc_time_hr_diffs = np.diff(nc_time_hr)
+                    
+                    # Calculate the rate of change of temperature wrt time
+                    nc_dtemp_dtime = np.divide(nc_temp_diffs,nc_time_hr_diffs)
+                    
+                    # Add the results for this netcdf to the record for the deployment
+                    deployment_dtemp_dtime = np.concatenate((deployment_dtemp_dtime,nc_dtemp_dtime))
+                    
+                    all_deployment_dtemp_dtime[plt_idx] = deployment_dtemp_dtime
+                
+                nc.close()
+                
+                
+
+
+
+
+
+
+fig, ax = plt.subplots(4,4)
+
+ax=ax.flatten()             
+
+line_thick = 1  
+
+label_coords = (0.6, 0.6)
+label_method = 'axes fraction' 
+                
+for plt_idx,dep_name in zip(range(0,len(deployments)),deployments):            
+
+    print('plotting '+ str(len(all_deployment_dtemp_dtime[plt_idx])) + ' values')
             
+    hist_data = ax[plt_idx].hist(all_deployment_dtemp_dtime[plt_idx],21,log=True)
+    
+    ax[plt_idx].set_title(dep_name,fontsize=10) 
+    
+    #ax[plt_idx].axvline(x=3*np.mean(all_deployment_dtemp_dtime[plt_idx]),color='g',linewidth=line_thick)
+    
+    ax[plt_idx].axvline(x=np.mean(all_deployment_dtemp_dtime[plt_idx])+3*np.std(all_deployment_dtemp_dtime[plt_idx]),color='r',linewidth=line_thick) 
 
-print (list_of_files)
+    ax[plt_idx].axvline(x=np.mean(all_deployment_dtemp_dtime[plt_idx])-3*np.std(all_deployment_dtemp_dtime[plt_idx]),color='r',linewidth=line_thick) 
     
+    anno = 'mean = '+str(round(float(np.mean(all_deployment_dtemp_dtime[plt_idx])),sigfigs=3))
     
+    anno += '\n3SD = ' + str(round(float(3*np.std(all_deployment_dtemp_dtime[plt_idx])),sigfigs=3))
+    
+    anno += '\nsamples = ' + str(len(all_deployment_dtemp_dtime[plt_idx]))
+    
+    ax[plt_idx].annotate(anno,xy=label_coords, xycoords=label_method,fontsize=8)
+    
+    #ax[plt_idx].set_ylim(bottom=0,top=np.max(hist_data[0]))
+    
+    #ax[plt_idx].set_xlim(left=-450, right=450)      np.linspace(-450,450,901)
+    
+#ax[-1].axis('off')
+    
+all_data = np.concatenate(all_deployment_dtemp_dtime)    
+    
+hist_data = ax[15].hist(all_data,21,log=True)
+
+ax[15].set_title('All data',fontsize=10) 
+
+#ax[plt_idx].axvline(x=3*np.mean(all_deployment_dtemp_dtime[plt_idx]),color='g',linewidth=line_thick)
+
+ax[15].axvline(x=np.mean(all_data)+3*np.std(all_data),color='r',linewidth=line_thick) 
+
+ax[15].axvline(x=np.mean(all_data)-3*np.std(all_data),color='r',linewidth=line_thick) 
+
+anno = 'mean = '+str(round(float(np.mean(all_data)),sigfigs=3))
+
+anno += '\n3SD = ' + str(round(float(3*np.std(all_data)),sigfigs=3))
+
+anno += '\nsamples = ' + str(len(all_data))
+
+ax[15].annotate(anno,xy=label_coords, xycoords=label_method,fontsize=8)
+
     
+fig.subplots_adjust(left=0.05,right=0.99,bottom=0.1,top=0.9,wspace=0.15,hspace=0.4)
+
+
+
+
+
+
+
+
+
+
+
+
 
 
 files = glob.glob('*.nc')
@@ -60,6 +284,8 @@
 ax.hist(temp_diffs,100,log=True)
 
 
+
+
 # use os.walk??? to run in each netcdf folder?? os.scandir()?
 
 
diff --git a/ocean_dp/qc/temp_time_diff_plots.py b/ocean_dp/qc/temp_time_diff_plots.py
new file mode 100755
index 0000000..cf42a13
--- /dev/null
+++ b/ocean_dp/qc/temp_time_diff_plots.py
@@ -0,0 +1,117 @@
+# Copyright (C) 2020 Ben Weeding
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+import numpy.ma as ma
+import sys
+from netCDF4 import Dataset, num2date
+from dateutil import parser
+import numpy as np
+import argparse
+import glob
+import pytz
+import os
+import matplotlib.pyplot as plt
+from matplotlib import colors
+from matplotlib.ticker import PercentFormatter
+from sigfig import round
+
+
+
+def last_four(entry):
+    
+    output = entry[-4::]
+    
+    return output
+
+
+deployments = []
+
+for x in os.listdir("/Users/tru050/Desktop/cloudstor/Shared/SOTS-Temp-Raw-Data"):
+    
+    if ('Pulse' in x) or ('SOFS' in x):
+        
+        deployments.append(x)
+
+deployments.sort(key=last_four)
+
+
+fig, ax = plt.subplots(4,4)
+
+ax=ax.flatten()  
+
+
+all_deployment_dtemp_dtime = [None] * len(deployments)
+
+for current_deployment, plt_idx in zip(deployments, range(0,len(deployments))):
+    
+    print('current deployment is '+current_deployment)
+    
+    deployment_dtemp_dtime = np.array([])
+    
+    for root, dirs, files in os.walk("/Users/tru050/Desktop/cloudstor/Shared/SOTS-Temp-Raw-Data"):
+    
+        for fname in files:
+          
+            if current_deployment in fname and fname.endswith('.nc') and 'FV00' in fname:
+            
+                #print(fname)  #Here, the wanted file name is printed
+
+                nc = Dataset(os.path.join(root,fname), mode = 'r')
+                
+                if 'TEMP' in nc.variables and np.array(nc.variables['TEMP'][:]).ndim == 1 and nc.variables['TIME'].getncattr('units') =='days since 1950-01-01 00:00:00 UTC':
+                    
+                    time_var = nc.variables["TIME"]
+                    
+                    time = num2date(time_var[:], units=time_var.units, calendar=time_var.calendar)
+                
+                    time_deploy = parser.parse(nc.time_deployment_start, ignoretz=True)
+                    
+                    time_recovery = parser.parse(nc.time_deployment_end, ignoretz=True)
+                    
+                    #print('using '+fname)
+                    
+                    temp_extract = np.array(nc.variables['TEMP'][:][(time >= time_deploy) | (time <= time_recovery)])
+                    
+                    # Calculate temperature changes
+                    nc_temp_diffs = np.diff(temp_extract)
+                    
+                    # Extract the time data
+                    nc_time = np.array(nc.variables['TIME'][:][(time >= time_deploy) | (time <= time_recovery)])
+                
+                    # Convert from days to hours
+                    nc_time_hr = nc_time*24
+                    
+                    ax[plt_idx].plot(nc_time,temp_extract)
+                    
+                    # Calculate time changes
+                    nc_time_hr_diffs = np.diff(nc_time_hr)
+                    
+                    # Calculate the rate of change of temperature wrt time
+                    nc_dtemp_dtime = np.divide(nc_temp_diffs,nc_time_hr_diffs)
+                    
+                    # Add the results for this netcdf to the record for the deployment
+                    deployment_dtemp_dtime = np.concatenate((deployment_dtemp_dtime,nc_dtemp_dtime))
+                    
+                    all_deployment_dtemp_dtime[plt_idx] = deployment_dtemp_dtime
+                
+                nc.close()
+                
+                
+                
+
+
+
+
+

From 222024a2dad9ce9e57992abc112c43a968908adc Mon Sep 17 00:00:00 2001
From: bweeding <57697604+bweeding@users.noreply.github.com>
Date: Tue, 17 Mar 2020 16:41:42 +1100
Subject: [PATCH 33/59] new plotting codes

---
 ocean_dp/qc/agg_temp_plot.py                  |  16 +-
 ocean_dp/qc/temp_diff_timeseries_from_fv00.py | 156 ++++++++++++++++++
 2 files changed, 171 insertions(+), 1 deletion(-)
 create mode 100755 ocean_dp/qc/temp_diff_timeseries_from_fv00.py

diff --git a/ocean_dp/qc/agg_temp_plot.py b/ocean_dp/qc/agg_temp_plot.py
index cb92107..9902493 100755
--- a/ocean_dp/qc/agg_temp_plot.py
+++ b/ocean_dp/qc/agg_temp_plot.py
@@ -45,7 +45,21 @@
 
 for i in set(np.array(ins_idx)):
 
-    ax[i].plot(time[ins_idx==i],temp[ins_idx==i])
+    cur_temp = temp[ins_idx==i]
+    
+    cur_time = time[ins_idx==i]
+    
+    cur_time_hr = cur_time*24
+    
+    # Calculate time changes
+    cur_time_hr_diffs = np.diff(cur_time_hr)
+    
+    cur_temp_diffs = np.diff(cur_temp)
+    
+    # Calculate the rate of change of temperature wrt time
+    cur_dtemp_dtime = np.divide(cur_temp_diffs,cur_time_hr_diffs)
+    
+    ax[i].scatter(cur_time,cur_temp,s=1,c=np.concatenate((np.array([0]),cur_dtemp_dtime)),cmap='cool')
     
     ax[i].annotate('S:'+str(i),xy=label_coords, xycoords=label_method,fontsize=8)
     
diff --git a/ocean_dp/qc/temp_diff_timeseries_from_fv00.py b/ocean_dp/qc/temp_diff_timeseries_from_fv00.py
new file mode 100755
index 0000000..b21a605
--- /dev/null
+++ b/ocean_dp/qc/temp_diff_timeseries_from_fv00.py
@@ -0,0 +1,156 @@
+# Copyright (C) 2020 Ben Weeding
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+import numpy.ma as ma
+import sys
+from netCDF4 import Dataset, num2date
+from dateutil import parser
+import numpy as np
+import argparse
+import glob
+import pytz
+import os
+import matplotlib.pyplot as plt
+from matplotlib import colors
+from matplotlib.ticker import PercentFormatter
+from sigfig import round
+
+def last_four(entry):
+    """Return the final four items of entry; used as the deployment sort key."""
+    
+    # A negative slice start clamps, so inputs shorter than four come back whole
+    return entry[-4:]
+
+
+def sp_layout(num_in):
+    """Return [rows, cols] of the smallest grid in the 1x1, 2x1, 2x2, 3x2, ... series holding num_in panels."""
+    rows, cols = 1, 1
+    # Alternately add a row then a column; the fixed table this replaces raised IndexError above 30 panels
+    while rows*cols < num_in:
+        rows, cols = (rows+1, cols) if rows == cols else (rows, cols+1)
+    return [rows, cols]
+
+
+
+deployments = []
+
+for x in os.listdir("/Users/tru050/Desktop/cloudstor/Shared/SOTS-Temp-Raw-Data"):
+    
+    if ('Pulse' in x) or ('SOFS' in x):
+
+        
+        deployments.append(x)
+
+deployments.sort(key=last_four)
+
+
+
+
+
+
+for current_deployment in deployments:
+    
+    acceptable_files = []
+    
+    print('current deployment is '+current_deployment)
+    
+    deployment_dtemp_dtime = np.array([])
+    
+    for root, dirs, files in os.walk("/Users/tru050/Desktop/cloudstor/Shared/SOTS-Temp-Raw-Data"):
+    
+        for fname in files:
+            
+            print('checking '+fname)
+          
+            if current_deployment in fname and fname.endswith('.nc') and 'FV00' in fname:
+                
+                print('opening '+fname)
+                
+                nc = Dataset(os.path.join(root,fname), mode = 'r')
+            
+                if 'TEMP' in nc.variables and np.array(nc.variables['TEMP'][:]).ndim == 1 and nc.variables['TIME'].getncattr('units') =='days since 1950-01-01 00:00:00 UTC':
+                
+                    acceptable_files.append(fname)
+                    
+                    print(fname+' accepted')
+                
+                nc.close()
+                    
+        fig, ax = plt.subplots(sp_layout(len(acceptable_files))[0],sp_layout(len(acceptable_files))[1])
+
+        ax=ax.flatten()
+                
+        for fname,f_idx in zip(acceptable_files, range(0,len(acceptable_files))):      
+                
+            nc = Dataset(os.path.join(root,fname), mode = 'r')
+                
+            time_var = nc.variables["TIME"]
+            
+            time = num2date(time_var[:], units=time_var.units, calendar=time_var.calendar)
+        
+            time_deploy = parser.parse(nc.time_deployment_start, ignoretz=True)
+            
+            time_recovery = parser.parse(nc.time_deployment_end, ignoretz=True)
+            
+            #print('using '+fname)
+            
+            temp_extract = np.array(nc.variables['TEMP'][:][(time > time_deploy) | (time < time_recovery)])
+            
+            # Calculate temperature changes
+            nc_temp_diffs = np.diff(temp_extract)
+            
+            # Extract the time data
+            nc_time = np.array(nc.variables['TIME'][:][(time >= time_deploy) | (time <= time_recovery)])
+        
+            # Convert from days to hours
+            nc_time_hr = nc_time*24
+            
+            # Calculate time changes
+            nc_time_hr_diffs = np.diff(nc_time_hr)
+            
+            # Calculate the rate of change of temperature wrt time
+            nc_dtemp_dtime = np.divide(nc_temp_diffs,nc_time_hr_diffs)
+            
+            ax[f_idx].scatter(nc_time,nc_dtemp_dtime)
+            
+            # Add the results for this netcdf to the record for the deployment
+            deployment_dtemp_dtime = np.concatenate((deployment_dtemp_dtime,nc_dtemp_dtime))
+            
+            nc.close()
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+

From d0f4a8e76dd62fd049b4e27061c181365e0d97d2 Mon Sep 17 00:00:00 2001
From: bweeding <57697604+bweeding@users.noreply.github.com>
Date: Tue, 24 Mar 2020 17:31:02 +1100
Subject: [PATCH 34/59] various bits

---
 ocean_dp/qc/agg_temp_plot.py                  |  23 ++-
 ocean_dp/qc/arg_tester.py                     |  38 ++++
 ocean_dp/qc/rate_of_change_test.py            |   4 +
 ocean_dp/qc/spike_test.py                     |  20 +-
 ocean_dp/qc/spike_test_ver_2.py               | 149 ++++++++++++++
 ocean_dp/qc/temp_diff_timeseries_from_fv00.py | 183 ++++++++++++++----
 6 files changed, 373 insertions(+), 44 deletions(-)
 create mode 100755 ocean_dp/qc/arg_tester.py
 create mode 100755 ocean_dp/qc/spike_test_ver_2.py

diff --git a/ocean_dp/qc/agg_temp_plot.py b/ocean_dp/qc/agg_temp_plot.py
index 9902493..71ab427 100755
--- a/ocean_dp/qc/agg_temp_plot.py
+++ b/ocean_dp/qc/agg_temp_plot.py
@@ -43,6 +43,17 @@
 label_coords = (0.1, 0.8)
 label_method = 'axes fraction' 
 
+# cmap = colors.ListedColormap(['black','green','blue','red','orange'])
+
+# boundaries = [0,5,10,20,40,80]
+
+cmap = colors.ListedColormap(['blue','orange','red'])
+
+boundaries = [0,20,30,500]
+
+norm = colors.BoundaryNorm(boundaries, cmap.N, clip=True)
+
+
 for i in set(np.array(ins_idx)):
 
     cur_temp = temp[ins_idx==i]
@@ -59,9 +70,17 @@
     # Calculate the rate of change of temperature wrt time
     cur_dtemp_dtime = np.divide(cur_temp_diffs,cur_time_hr_diffs)
     
-    ax[i].scatter(cur_time,cur_temp,s=1,c=np.concatenate((np.array([0]),cur_dtemp_dtime)),cmap='cool')
+    im = ax[i].scatter(cur_time,cur_temp,s=1,c=np.concatenate((np.array([0]),np.abs(cur_dtemp_dtime))),cmap=cmap,norm=norm)
+    
+    #ax[i].set_title(,fontsize=10) 
+    
+    ax[i].annotate('Ins:'+str(i),xy=label_coords, xycoords=label_method,fontsize=8)
+    
+    # One colourbar is shared by every panel, so it is added once after the loop
+    
+    
     
-    ax[i].annotate('S:'+str(i),xy=label_coords, xycoords=label_method,fontsize=8)
+fig.colorbar(plt.cm.ScalarMappable(norm=norm,cmap=cmap),ax=ax)  # colorbar needs a mappable, not a bare Colormap
     
 i=1
 fig, ax = plt.subplots()
diff --git a/ocean_dp/qc/arg_tester.py b/ocean_dp/qc/arg_tester.py
new file mode 100755
index 0000000..2bfe7b4
--- /dev/null
+++ b/ocean_dp/qc/arg_tester.py
@@ -0,0 +1,38 @@
+# Copyright (C) 2020 Ben Weeding
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+import re
+from datetime import datetime, timedelta
+from netCDF4 import num2date, date2num
+from netCDF4 import stringtochar
+import numpy.ma as ma
+import sys
+from netCDF4 import Dataset
+import numpy as np
+import argparse
+import glob
+import pytz
+import os
+
+def learn_arg_sys(text_in):
+    """Print text_in; a minimal target for experimenting with command-line arguments."""
+    print(text_in)
+
+# USE runfile('arg_tester.py', args='bing')
+
+
+if __name__ == "__main__":
+    # usage is <file_name> <text>; guarded so that importing this module prints nothing
+    learn_arg_sys(text_in=sys.argv[1])
\ No newline at end of file
diff --git a/ocean_dp/qc/rate_of_change_test.py b/ocean_dp/qc/rate_of_change_test.py
index c3e1d9b..f4e187b 100755
--- a/ocean_dp/qc/rate_of_change_test.py
+++ b/ocean_dp/qc/rate_of_change_test.py
@@ -114,6 +114,8 @@ def roc_test(nc,*args,target_vars_in=[]):
                 
                 # For any change greater than change_per_hr, assign a qc value of 4
                 nc.variables[current_var+'_quality_control'][[x for x in abs(np.insert(var_roc,0,0)) > change_per_hr]] = 4
+                
+                print(current_var + ' tested: '+str(sum([x for x in abs(np.insert(var_roc,0,0)) > change_per_hr])) + ' changes found above '+str(change_per_hr)+' '+nc.variables[current_var].units+' per hour')
                     
             # update the history attribute
             try:
@@ -152,6 +154,8 @@ def roc_test(nc,*args,target_vars_in=[]):
                 
                 # For any change greater than change_per_hr, assign a qc value of 4
                 nc.variables[current_var+'_quality_control'][[x for x in abs(np.insert(var_roc,0,0)) > rate_spec[current_var]]] = 4
+                
+                print(current_var + ' tested: '+str(sum([x for x in abs(np.insert(var_roc,0,0)) > rate_spec[current_var]])) + ' changes found above '+str(rate_spec[current_var])+' '+nc.variables[current_var].units+' per hour')
                     
             # update the history attribute
             try:
diff --git a/ocean_dp/qc/spike_test.py b/ocean_dp/qc/spike_test.py
index 154f439..a061c32 100755
--- a/ocean_dp/qc/spike_test.py
+++ b/ocean_dp/qc/spike_test.py
@@ -28,13 +28,13 @@
 
 
 # If files aren't specified, take all the IMOS*.nc files in the current folder
-def spike_test_all_files(target_vars_in=[], thresh_low=2, thresh_high=4, flag_low=3, flag_high=4):
+def spike_test_all_files(target_vars_in=[], thresh_low=10, thresh_high=20, flag_low=3, flag_high=4):
     target_files = glob.glob('IMOS*.nc')
 
     spike_test_files(target_files, target_vars_in=target_vars_in, thresh_low=thresh_low,thresh_high=thresh_high,flag_low=flag_low, flag_high=flag_high)
 
 
-def spike_test_files(target_files, target_vars_in=[], thresh_low=2, thresh_high=4, flag_low=3, flag_high=4):
+def spike_test_files(target_files, target_vars_in=[], thresh_low=10, thresh_high=20, flag_low=3, flag_high=4):
     
     # Loop through each files in target_files
     for current_file in target_files:
@@ -48,7 +48,7 @@ def spike_test_files(target_files, target_vars_in=[], thresh_low=2, thresh_high=
         spike_test(nc=nc, target_vars_in=target_vars_in, thresh_low=thresh_low,thresh_high=thresh_high,flag_low=flag_low, flag_high=flag_high)
 
 
-def spike_test(nc, target_vars_in=[], thresh_low=2, thresh_high=4, flag_low=3, flag_high=4):
+def spike_test(nc, target_vars_in=[], thresh_low=10, thresh_high=20, flag_low=3, flag_high=4):
     
     # If target_vars aren't user specified, set it to all the variables of 
     # the current_file, removing unwanted variables
@@ -77,6 +77,8 @@ def spike_test(nc, target_vars_in=[], thresh_low=2, thresh_high=4, flag_low=3, f
         
         var_data = np.array(nc.variables[current_var])
         
+        
+        
         print('checking '+current_var+' for high spikes')
         
         # Step through the data, one element at a time, starting from the 2nd element
@@ -85,8 +87,11 @@ def spike_test(nc, target_vars_in=[], thresh_low=2, thresh_high=4, flag_low=3, f
             # Calculate the mean of the i-1 and i+1 elements
             shoulder_mean = np.mean(np.take(var_data,[i-1,i+1]))
             
+            # Calculate the step changes
+            shoulder_diff = np.diff(var_data[i-1:i+2])
+            
             # Check for spike exceeding high threshold
-            if abs(var_data[i]-shoulder_mean) > thresh_high:
+            if (abs(var_data[i]-shoulder_mean) > thresh_high) & (True in (shoulder_diff>0)) & (True in (shoulder_diff<0)):# & (1.25*abs(shoulder_diff[0]) >= abs(x[1]) >= 0.75*abs(shoulder_diff[0])):
                 
                 print('High spike found')
                 
@@ -125,14 +130,17 @@ def spike_test(nc, target_vars_in=[], thresh_low=2, thresh_high=4, flag_low=3, f
             # Calculate the mean of the i-1 and i+1 elements
             shoulder_mean = np.mean(np.take(var_data,[i-1,i+1]))
             
+            # Calculate the step changes
+            shoulder_diff = np.diff(var_data[i-1:i+2])
+            
             #print('shoulder mean is '+str(shoulder_mean))
             
-            abs_diff = abs(var_data[i]-shoulder_mean)
+            #abs_diff = abs(var_data[i]-shoulder_mean)
             
             #print('absolute difference is '+str(abs_diff))
             
             # Check for spike exceeding low threshold
-            if abs(var_data[i]-shoulder_mean) > thresh_low:
+            if (abs(var_data[i]-shoulder_mean) > thresh_low) & (True in (shoulder_diff>0)) & (True in (shoulder_diff<0)): #& (1.25*abs(shoulder_diff[0]) >= abs(x[1]) >= 0.75*abs(shoulder_diff[0])):
                 
                 print('Low spike found')
                 
diff --git a/ocean_dp/qc/spike_test_ver_2.py b/ocean_dp/qc/spike_test_ver_2.py
new file mode 100755
index 0000000..68fe69e
--- /dev/null
+++ b/ocean_dp/qc/spike_test_ver_2.py
@@ -0,0 +1,149 @@
+# Copyright (C) 2020 Ben Weeding
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+import re
+from datetime import datetime, timedelta
+from netCDF4 import num2date, date2num
+from netCDF4 import stringtochar
+import numpy.ma as ma
+import sys
+from netCDF4 import Dataset
+import numpy as np
+import argparse
+import glob
+import pytz
+import os
+
+# If files aren't specified, take all the IMOS*.nc files in the current folder
+def spike_test_all_files(target_vars_in=[], thresh_low=2, thresh_high=4, flag_low=3, flag_high=4):
+    target_files = glob.glob('IMOS*.nc')
+
+    spike_test_files(target_files, target_vars_in=target_vars_in, thresh_low=thresh_low,thresh_high=thresh_high,flag_low=flag_low, flag_high=flag_high)
+
+
+def spike_test_files(target_files, target_vars_in=[], thresh_low=2, thresh_high=4, flag_low=3, flag_high=4):
+    
+    # Loop through each files in target_files
+    for current_file in target_files:
+        # Print each filename
+        print("input file %s" % current_file)
+
+        # Extract netcdf data into nc
+        nc = Dataset(current_file, mode="a")
+
+        # run the spike test
+        spike_test(nc=nc, target_vars_in=target_vars_in, thresh_low=thresh_low,thresh_high=thresh_high,flag_low=flag_low, flag_high=flag_high)
+
+
+
+def spike_test(nc, target_vars_in=[], thresh_low=2, thresh_high=4, flag_low=3, flag_high=4):
+    
+    # If target_vars aren't user specified, set it to all the variables of 
+    # the current_file, removing unwanted variables
+    if target_vars_in == []:
+        
+        target_vars = list(nc.variables.keys())
+        
+        # Remove TIME
+        target_vars.remove('TIME')
+        
+        # Remove any quality_control variables
+        qc_vars = [s for s in target_vars if 'quality_control' in s]
+        target_vars = [s for s in target_vars if s not in qc_vars]
+                        
+        # Remove any variables of single length
+        single_vars = [s for s in target_vars if nc.variables[s].size==1]
+        target_vars = [s for s in target_vars if s not in single_vars]
+        
+        print('target_vars are '+' '.join(target_vars))
+        
+    else:
+        target_vars = target_vars_in
+        
+    # For each variable, extract the data 
+    for current_var in target_vars:
+        
+        var_data = np.array(nc.variables[current_var])
+        
+        print('checking '+current_var+' for high spikes')
+        
+        # Step through the data, one element at a time, starting from the 2nd element
+        for i in range(1,(len(var_data)-1)):
+            
+            # Calculate the mean of the i-1 and i+1 elements
+            shoulder_mean = np.mean(np.take(var_data,[i-1,i+1]))
+            
+            # Calculate the step changes
+            shoulder_diff = np.diff(var_data[i-1:i+2])
+            
+            # Check for spike exceeding high threshold
+            if (abs(var_data[i]-shoulder_mean) > thresh_high) & (True in (shoulder_diff>=0)) & (False in (shoulder_diff>=0)):
+                
+                print('High spike found')
+                
+                #set corresponding QC value to...
+                nc.variables[current_var+'_quality_control'][i] = flag_high
+        
+        print('checking '+current_var+' for low spikes')
+        
+        # For each of the remaining indices
+        for i in low_spike_chk_idx:
+            
+            #print('i is '+str(i))
+            
+            # Calculate the mean of the i-1 and i+1 elements
+            shoulder_mean = np.mean(np.take(var_data,[i-1,i+1]))
+            
+            # Calculate the step changes
+            shoulder_diff = np.diff(var_data[i-1:i+2])
+            
+            #print('shoulder mean is '+str(shoulder_mean))
+            
+            abs_diff = abs(var_data[i]-shoulder_mean)
+            
+            #print('absolute difference is '+str(abs_diff))
+            
+            # Check for spike exceeding low threshold
+            if (abs(var_data[i]-shoulder_mean) > thresh_low) & (True in (shoulder_diff>=0)) & (False in (shoulder_diff>=0)):
+                
+                print('Low spike found')
+                
+                #set corresponding QC value to...
+                nc.variables[current_var+'_quality_control'][i] = flag_low        
+
+        # # Extract the qc data         
+        # current_qc = np.array(nc.variables[current_var+'_quality_control'][:])      
+        
+        # # Find all the instances of consecutive 3s, and reset them to 0        
+        # for i in np.where(current_qc==3)[0][0:-1][np.diff(np.where(current_qc==3)[0])==1]:
+
+        #     nc.variables[current_var+'_quality_control'][i:i+2] = 0     
+
+
+    # update the history attribute
+    try:
+        hist = nc.history + "\n"
+    except AttributeError:
+        hist = ""
+
+    nc.setncattr('history', hist + datetime.utcnow().strftime("%Y-%m-%d") + ' :spike_test performed on [' + str(target_vars) + '], with spikes greater than '+str(thresh_high)+' flagged as '+str(flag_high)+' and spikes greater than '+str(thresh_low)+' flagged as '+str(flag_low))
+
+    nc.close()
+    
+if __name__ == "__main__":
+    # usage is <file_name> <variable_name> <window> <flag value>
+    spike_test_files(target_files=[sys.argv[1]], target_vars_in=[sys.argv[2]], thresh_low=float(sys.argv[3]), thresh_high=float(sys.argv[4]), flag_low= float(sys.argv[5]), flag_high= float(sys.argv[6]))
+
+    
\ No newline at end of file
diff --git a/ocean_dp/qc/temp_diff_timeseries_from_fv00.py b/ocean_dp/qc/temp_diff_timeseries_from_fv00.py
index b21a605..9a38ccf 100755
--- a/ocean_dp/qc/temp_diff_timeseries_from_fv00.py
+++ b/ocean_dp/qc/temp_diff_timeseries_from_fv00.py
@@ -44,11 +44,19 @@ def sp_layout(num_in):
 
 
 
+cmap = colors.ListedColormap(['blue','orange','red'])
+
+boundaries = [0,20,30,2000]
+
+norm = colors.BoundaryNorm(boundaries, cmap.N, clip=True)
+
+
+
 deployments = []
 
-for x in os.listdir("/Users/tru050/Desktop/cloudstor/Shared/SOTS-Temp-Raw-Data"):
+for x in next(os.walk("/Users/tru050/Desktop/cloudstor/Shared/SOTS-Temp-Raw-Data"))[1]:
     
-    if ('Pulse' in x) or ('SOFS' in x):
+    if (('Pulse' in x) or ('SOFS' in x)):
 
         
         deployments.append(x)
@@ -64,6 +72,8 @@ def sp_layout(num_in):
     
     acceptable_files = []
     
+    acceptable_depths = []
+    
     print('current deployment is '+current_deployment)
     
     deployment_dtemp_dtime = np.array([])
@@ -82,57 +92,158 @@ def sp_layout(num_in):
             
                 if 'TEMP' in nc.variables and np.array(nc.variables['TEMP'][:]).ndim == 1 and nc.variables['TIME'].getncattr('units') =='days since 1950-01-01 00:00:00 UTC':
                 
-                    acceptable_files.append(fname)
+                    acceptable_files.append(os.path.join(root,fname))
+                    
+                    acceptable_depths.append(nc.instrument_nominal_depth)
                     
                     print(fname+' accepted')
                 
                 nc.close()
-                    
-        fig, ax = plt.subplots(sp_layout(len(acceptable_files))[0],sp_layout(len(acceptable_files))[1])
-
-        ax=ax.flatten()
                 
-        for fname,f_idx in zip(acceptable_files, range(0,len(acceptable_files))):      
+    
+    acceptable_files = [x for _,x in sorted(zip(acceptable_depths,acceptable_files))]            
                 
-            nc = Dataset(os.path.join(root,fname), mode = 'r')
+                    
+    fig, ax = plt.subplots(*sp_layout(len(acceptable_files)),figsize=(12,8),constrained_layout=True,sharex=True,squeeze=False)
+
+    ax=ax.flatten()  # squeeze=False always yields a 2-D array, so this also works for a 1x1 grid
+    
+    fig.suptitle(current_deployment, fontsize=14)
+    
                 
-            time_var = nc.variables["TIME"]
-            
-            time = num2date(time_var[:], units=time_var.units, calendar=time_var.calendar)
-        
-            time_deploy = parser.parse(nc.time_deployment_start, ignoretz=True)
-            
-            time_recovery = parser.parse(nc.time_deployment_end, ignoretz=True)
+    for fpath,f_idx in zip(acceptable_files, range(0,len(acceptable_files))):      
             
-            #print('using '+fname)
+        nc = Dataset(fpath, mode = 'r') #mismatching files and folders?? looking for sofs7.5 file in pulse 7 folder!?!?
             
-            temp_extract = np.array(nc.variables['TEMP'][:][(time > time_deploy) | (time < time_recovery)])
-            
-            # Calculate temperature changes
-            nc_temp_diffs = np.diff(temp_extract)
-            
-            # Extract the time data
-            nc_time = np.array(nc.variables['TIME'][:][(time >= time_deploy) | (time <= time_recovery)])
+        time_var = nc.variables["TIME"]
         
-            # Convert from days to hours
-            nc_time_hr = nc_time*24
-            
-            # Calculate time changes
-            nc_time_hr_diffs = np.diff(nc_time_hr)
-            
-            # Calculate the rate of change of temperature wrt time
-            nc_dtemp_dtime = np.divide(nc_temp_diffs,nc_time_hr_diffs)
+        time = num2date(time_var[:], units=time_var.units, calendar=time_var.calendar)
+    
+        time_deploy = parser.parse(nc.time_deployment_start, ignoretz=True)
+        
+        time_recovery = parser.parse(nc.time_deployment_end, ignoretz=True)
+        
+        #print('using '+fname)
+        
+        temp_extract = np.array(nc.variables['TEMP'][:][(time > time_deploy) & (time < time_recovery)])
+        
+        # Calculate temperature changes
+        nc_temp_diffs = np.diff(temp_extract)
+        
+        # Extract the time data
+        nc_time = np.array(nc.variables['TIME'][:][(time > time_deploy) & (time < time_recovery)])
+    
+        # Convert from days to hours
+        nc_time_hr = nc_time*24
+        
+        # Calculate time changes
+        nc_time_hr_diffs = np.diff(nc_time_hr)
+        
+        # Calculate the rate of change of temperature wrt time
+        nc_dtemp_dtime = np.divide(nc_temp_diffs,nc_time_hr_diffs)
+        
+        abs_roc = np.concatenate((np.array([0]),np.abs(nc_dtemp_dtime)))  # |dT/dt| with a leading 0 so it aligns with nc_time
+        im=ax[f_idx].scatter(nc_time,temp_extract,s=0.2,c=abs_roc,cmap=cmap,norm=norm)
+        fast = abs_roc>20  # samples past the first colour boundary, redrawn larger on top
+        im=ax[f_idx].scatter(nc_time[fast],temp_extract[fast],s=0.5,c=abs_roc[fast],cmap=cmap,norm=norm)
+        ax[f_idx].set_title(str(nc.instrument_nominal_depth)+'m',fontsize=10)
+        
+        # Add the results for this netcdf to the record for the deployment
+        deployment_dtemp_dtime = np.concatenate((deployment_dtemp_dtime,nc_dtemp_dtime))
+        
+        nc.close()
+        
+        if f_idx==0:
             
-            ax[f_idx].scatter(nc_time,nc_dtemp_dtime)
+            fig.colorbar(im)
             
-            # Add the results for this netcdf to the record for the deployment
-            deployment_dtemp_dtime = np.concatenate((deployment_dtemp_dtime,nc_dtemp_dtime))
+    for f_idx in range(len(acceptable_files),len(ax)):       
             
-            nc.close()
+        ax[f_idx].set_axis_off()
 
 
+##############################################################################
 
+fig, ax = plt.subplots(*sp_layout(len(deployments)),figsize=(12,8),constrained_layout=True,sharex=False,squeeze=False)  # squeeze=False keeps ax an array even for a 1x1 grid
 
+ax=ax.flatten()
+
+for current_deployment,d_idx in zip(deployments,range(0,len(deployments))):
+    
+    acceptable_files = []
+    
+    acceptable_depths = []
+    
+    print('current deployment is '+current_deployment)
+    
+    deployment_dtemp_dtime = np.array([])
+    
+    for root, dirs, files in os.walk("/Users/tru050/Desktop/cloudstor/Shared/SOTS-Temp-Raw-Data"):
+    
+        for fname in files:
+            
+            print('checking '+fname)
+          
+            if current_deployment in fname and fname.endswith('.nc') and 'FV00' in fname:
+                
+                print('opening '+fname)
+                
+                nc = Dataset(os.path.join(root,fname), mode = 'r')
+            
+                if 'TEMP' in nc.variables and np.array(nc.variables['TEMP'][:]).ndim == 1 and nc.variables['TIME'].getncattr('units') =='days since 1950-01-01 00:00:00 UTC':
+                
+                    acceptable_files.append(os.path.join(root,fname))
+                    
+                    acceptable_depths.append(nc.instrument_nominal_depth)
+                    
+                    print(fname+' accepted')
+                
+                nc.close()
+                
+    
+    acceptable_files = [x for _,x in sorted(zip(acceptable_depths,acceptable_files))]            
+                    
+                
+    for fpath in acceptable_files:      
+            
+        nc = Dataset(fpath, mode = 'r') #mismatching files and folders?? looking for sofs7.5 file in pulse 7 folder!?!?
+            
+        time_var = nc.variables["TIME"]
+        
+        time = num2date(time_var[:], units=time_var.units, calendar=time_var.calendar)
+    
+        time_deploy = parser.parse(nc.time_deployment_start, ignoretz=True)
+        
+        time_recovery = parser.parse(nc.time_deployment_end, ignoretz=True)
+        
+        #print('using '+fname)
+        
+        temp_extract = np.array(nc.variables['TEMP'][:][(time > time_deploy) & (time < time_recovery)])
+        
+        # Calculate temperature changes
+        nc_temp_diffs = np.diff(temp_extract)
+        
+        # Extract the time data
+        nc_time = np.array(nc.variables['TIME'][:][(time > time_deploy) & (time < time_recovery)])
+    
+        # Convert from days to hours
+        nc_time_hr = nc_time*24
+        
+        # Calculate time changes
+        nc_time_hr_diffs = np.diff(nc_time_hr)
+        
+        # Calculate the rate of change of temperature wrt time
+        nc_dtemp_dtime = np.divide(nc_temp_diffs,nc_time_hr_diffs)
+        
+        # Add the results for this netcdf to the record for the deployment
+        deployment_dtemp_dtime = np.concatenate((deployment_dtemp_dtime,nc_dtemp_dtime))
+        
+        nc.close()
+        
+    ax[d_idx].hist(deployment_dtemp_dtime,21,log=True)
+    
+    ax[d_idx].set_title(current_deployment,fontsize=10)
+        
 
 
 

From 5b390327300e528da818f7b281746b248a63f407 Mon Sep 17 00:00:00 2001
From: bweeding <57697604+bweeding@users.noreply.github.com>
Date: Tue, 21 Apr 2020 17:15:06 +1000
Subject: [PATCH 35/59] QC and preprocessing

---
 ocean_dp/aggregation/copyDataset.py          |  73 ++++-----
 ocean_dp/processing/pressure_interpolator.py |   7 +-
 ocean_dp/qc/flatline_test.py                 |  27 +++-
 ocean_dp/qc/global_range.py                  |  69 +++++++--
 ocean_dp/qc/in_out_water.py                  |  66 ++++----
 ocean_dp/qc/rate_of_change_test.py           |  43 +++++-
 ocean_dp/qc/spike_test.py                    |  64 ++++----
 ocean_dp/qc/spike_test_ver_2.py              | 149 -------------------
 ocean_dp/sots_processing_runthrough.py       |  74 +++++++++
 9 files changed, 306 insertions(+), 266 deletions(-)
 delete mode 100755 ocean_dp/qc/spike_test_ver_2.py
 create mode 100755 ocean_dp/sots_processing_runthrough.py

diff --git a/ocean_dp/aggregation/copyDataset.py b/ocean_dp/aggregation/copyDataset.py
index 95c5998..54eb960 100644
--- a/ocean_dp/aggregation/copyDataset.py
+++ b/ocean_dp/aggregation/copyDataset.py
@@ -38,7 +38,10 @@ def aggregate(files, varNames):
     # look over all files, create a time array from all files
     # TODO: maybe delete files here without variables we're not interested in
     # TODO: Create set of variables in all files
-
+    if not isinstance(varNames, list):
+        
+        varNames = [varNames]
+    
     filen = 0
     for path_file in files:
 
@@ -351,56 +354,56 @@ def aggregate(files, varNames):
     return outputName
 
 
-def collect_vars_to_agg(files):
+# def collect_vars_to_agg(files):
 
-    var_list = []
+#     var_list = []
 
-    nc = Dataset(files[0])
-    varList = nc.variables
+#     nc = Dataset(files[0])
+#     varList = nc.variables
 
-    # default to all variables in first file should no variable be specified
-    var_list.extend(varList.keys())
-    var_list.remove("TIME")
+#     # default to all variables in first file should no variable be specified
+#     var_list.extend(varList.keys())
+#     var_list.remove("TIME")
 
-    nc.close()
+#     nc.close()
 
-    print("collect_vars_to_agg::", var_list)
+#     print("collect_vars_to_agg::", var_list)
 
-    return var_list
+#     return var_list
 
 
-if __name__ == "__main__":
+# if __name__ == "__main__":
 
-    files = []
-    varToAgg = None # defaults to all in first file
+#     files = []
+#     varToAgg = None # defaults to all in first file
 
-    if len(sys.argv) > 1:
-        parser = argparse.ArgumentParser()
-        parser.add_argument('-v', action='append', dest='var', help='variable to include in output file (defaults to all)')
-        parser.add_argument('-f', dest='filelist', help='read file names from file')
-        parser.add_argument('file', nargs='*', help='input file name')
-        args = parser.parse_args()
+#     if len(sys.argv) > 1:
+#         parser = argparse.ArgumentParser()
+#         parser.add_argument('-v', action='append', dest='var', help='variable to include in output file (defaults to all)')
+#         parser.add_argument('-f', dest='filelist', help='read file names from file')
+#         parser.add_argument('file', nargs='*', help='input file name')
+#         args = parser.parse_args()
 
-        if not isinstance(args.filelist, type(None)):
-            with open(args.filelist, "r") as ins:
-                for line in ins:
-                    print(line)
-                    files.append(line.strip())
+#         if not isinstance(args.filelist, type(None)):
+#             with open(args.filelist, "r") as ins:
+#                 for line in ins:
+#                     print(line)
+#                     files.append(line.strip())
 
-        if len(args.file):
-            # files = args.file
-            for fn in args.file:
-                files.extend(glob.glob(fn))
+#         if len(args.file):
+#             # files = args.file
+#             for fn in args.file:
+#                 files.extend(glob.glob(fn))
 
-        varToAgg = args.var
+#         varToAgg = args.var
 
-    if isinstance(varToAgg, type(None)):
-        varToAgg = collect_vars_to_agg(files)
+#     if isinstance(varToAgg, type(None)):
+#         varToAgg = collect_vars_to_agg(files)
 
-    print("Aggregating variables ", varToAgg)
+#     print("Aggregating variables ", varToAgg)
 
-    outputName = aggregate(files, varToAgg)
+#     outputName = aggregate(files, varToAgg)
 
-    print("Output file :  %s" % outputName)
+#     print("Output file :  %s" % outputName)
 
 
diff --git a/ocean_dp/processing/pressure_interpolator.py b/ocean_dp/processing/pressure_interpolator.py
index af248c2..0473144 100755
--- a/ocean_dp/processing/pressure_interpolator.py
+++ b/ocean_dp/processing/pressure_interpolator.py
@@ -26,6 +26,8 @@
 
 def pressure_interpolator(netCDFfiles = [],agg = []):
     
+    files_out = []
+    
     if netCDFfiles==[]:
         
         print('netcdffiles = none')
@@ -65,6 +67,8 @@ def pressure_interpolator(netCDFfiles = [],agg = []):
         # a copy of the old file with the new name
         if fn_new != fn:
             
+            files_out.append(fn_new)
+            
             print('copying file')
             # copy file
             shutil.copy(fn, fn_new)
@@ -254,7 +258,8 @@ def pressure_interpolator(netCDFfiles = [],agg = []):
             fv01_contents.close()
             
     agg.close()
-            
+    
+    return files_out        
                 
                 
                 
diff --git a/ocean_dp/qc/flatline_test.py b/ocean_dp/qc/flatline_test.py
index e211cf4..a9e7d23 100755
--- a/ocean_dp/qc/flatline_test.py
+++ b/ocean_dp/qc/flatline_test.py
@@ -28,13 +28,13 @@
 
 
 # If files aren't specified, take all the IMOS*.nc files in the current folder
-def flatline_test_all_files(target_vars_in=[], window=3, flag=3):
+def flatline_test_all_files(target_vars_in=[], window=3, flag=4):
     target_files = glob.glob('IMOS*.nc')
 
     flatline_test_files(target_files, target_vars_in=target_vars_in, window=window, flag=flag)
 
 
-def flatline_test_files(target_files, target_vars_in=[], window=3, flag=3):
+def flatline_test_files(target_files, target_vars_in=[], window=3, flag=4):
     
     # Loop through each files in target_files
     for current_file in target_files:
@@ -48,7 +48,7 @@ def flatline_test_files(target_files, target_vars_in=[], window=3, flag=3):
         flatline_test(nc=nc, target_vars_in=target_vars_in, window=window, flag=flag)
 
 
-def flatline_test(nc, target_vars_in=[], window=3, flag=3):
+def flatline_test(nc, target_vars_in=[], window=3, flag=4):
     
         # If target_vars aren't user specified, set it to all the variables of 
         # the current_file, removing unwanted variables
@@ -75,6 +75,21 @@ def flatline_test(nc, target_vars_in=[], window=3, flag=3):
         # For each variable, extract the data 
         for current_var in target_vars:
             
+            # Extract the variable
+            nc_var = nc.variables[current_var]
+            
+            if nc_var.name + "_quality_control_flt" in nc.variables:
+                ncVarOut = nc.variables[nc_var.name + "_quality_control_flt"]
+            else:
+                ncVarOut = nc.createVariable(nc_var.name + "_quality_control_flt", "i1", nc_var.dimensions, fill_value=99, zlib=True)  # fill_value=0 otherwise defaults to max
+                ncVarOut[:] = np.zeros(nc_var.shape)
+                ncVarOut.long_name = "quality flag for " + nc_var.name
+                ncVarOut.flag_values = np.array([0, 1, 2, 3, 4, 6, 7, 9], dtype=np.int8)
+                ncVarOut.flag_meanings = 'unknown good_data probably_good_data probably_bad_data bad_data not_deployed interpolated missing_value'
+        
+            # add new variable to list of aux variables
+            nc_var.ancillary_variables = nc_var.ancillary_variables + " " + nc_var.name + "_quality_control_flt"
+            
             var_data = np.array(nc.variables[current_var])
             
             print('checking '+current_var)
@@ -86,13 +101,17 @@ def flatline_test(nc, target_vars_in=[], window=3, flag=3):
                 if len(set(var_data[i:(i+window)])) == 1:
                     
                     # set corresponding QC value to...
-                    nc.variables[current_var+'_quality_control'][i:(i+window)] = flag
+                    nc.variables[current_var+'_quality_control_flt'][i:(i+window)] = flag
+                    
+            nc.variables[current_var  + "_quality_control"][:] = np.maximum(nc.variables[current_var  + "_quality_control_flt"][:],nc.variables[current_var  + "_quality_control"][:])
 
         # update the history attribute
         try:
             hist = nc.history + "\n"
         except AttributeError:
             hist = ""
+            
+            
 
         nc.setncattr('history', hist + datetime.utcnow().strftime("%Y-%m-%d") + 'flatline_test performed on [' + str(target_vars) + '], window '+str(window)+' consecutive values or more were flagged with '+str(flag) )
 
diff --git a/ocean_dp/qc/global_range.py b/ocean_dp/qc/global_range.py
index e11bf40..68e2ee0 100644
--- a/ocean_dp/qc/global_range.py
+++ b/ocean_dp/qc/global_range.py
@@ -28,29 +28,71 @@
 # flag 4 (bad) when out of global range
 
 
-def global_range(netCDFfile, variable, max, min):
+def global_range(netCDFfile, variable, max, min, qc_value=4):
     ds = Dataset(netCDFfile, 'a')
 
-    var = ds.variables[variable]
+    nc_var = ds.variables[variable]
+    var_data = nc_var[:]
+    var_data.mask = False
 
     try:
-        var_qc = ds.variables[variable + "_quality_control"]
+        # find the existing quality_control variable in the auxillary variables list
+        aux_vars = nc_var.ancillary_variables
+        aux_var = aux_vars.split(" ")
+        qc_vars = [i for i in aux_var if i.endswith("_quality_control")]
+        qc_var = qc_vars[0]
+        print("QC var name ", qc_var)
+        var_qc = ds.variables[qc_var]
     except KeyError:
         print("no QC variable found")
         return None
 
+    # read existing quality_control flags
+    qc = var_qc[:]
+
     # this is where the actual QC test is done
-    mask = ((var[:] > max) | (var[:] < min))
+    mask = ((var_data > max) | (var_data < min))
+    print('mask data ', mask)
+
+    # create a qc variable just for this test flags
+    if nc_var.name + "_quality_control_gr" in ds.variables:
+        ncVarOut = ds.variables[nc_var.name + "_quality_control_gr"]
+    else:
+        ncVarOut = ds.createVariable(nc_var.name + "_quality_control_gr", "i1", nc_var.dimensions, fill_value=99, zlib=True)  # fill_value=0 otherwise defaults to max
+        ncVarOut[:] = np.zeros(nc_var.shape)
+        ncVarOut.long_name = "quality flag for " + nc_var.name
+        ncVarOut.flag_values = np.array([0, 1, 2, 3, 4, 6, 7, 9], dtype=np.int8)
+        ncVarOut.flag_meanings = 'unknown good_data probably_good_data probably_bad_data bad_data not_deployed interpolated missing_value'
+
+    # add new variable to list of aux variables
+    nc_var.ancillary_variables = nc_var.ancillary_variables + " " + nc_var.name + "_quality_control_gr"
+
+    # store the qc flags
+    ncVarOut[mask] = qc_value
 
-    mask = mask & (var_qc[:] < 1) # only mark data that has not been QCd already
+    # store qc flags to main quality_control flags variable
+    mask = mask & (qc < 1)  # only mark data that has not been QCd already
+    print('mask other qc ', mask)
 
-    var_qc[mask] = 4
-    count = sum(mask)
-    print('marked records ', count)
+    qc[mask] = qc_value  # mark the out of range points with bad_data
+
+    # calculate the number of points marked as bad_data
+    marked = np.zeros_like(qc)
+    marked[mask] = 1
+    count = sum(marked)
+    print('marked records ', count, mask, qc)
+
+    # write flags back to main QC variable
+    var_qc[:] = qc
 
     # update the history attribute
-    history = ds.history
-    ds.setncattr("history", history + "\n" + datetime.utcnow().strftime("%Y-%m-%d") + " " + variable + " global range min = " + str(min) + " max = " + str(max) + " marked " + str(count))
+    try:
+        hist = ds.history + "\n"
+    except AttributeError:
+        hist = ""
+    ds.setncattr("history", hist + datetime.utcnow().strftime("%Y-%m-%d") + " " + variable + " global range min = " + str(min) + " max = " + str(max) + " marked " + str(count))
+
+    ds.variables[variable + "_quality_control"][:] = np.maximum(ds.variables[variable + "_quality_control_gr"][:],ds.variables[variable + "_quality_control"][:])
 
     ds.close()
 
@@ -59,5 +101,8 @@ def global_range(netCDFfile, variable, max, min):
 
 if __name__ == "__main__":
 
-    # usage is <file_name> <variable_name> <max> <min>
-    global_range(sys.argv[1], sys.argv[2], float(sys.argv[3]), float(sys.argv[4]))
+    # usage is <file_name> <variable_name> <max> <min> <qc value>
+    if len(sys.argv) > 5:
+        global_range(sys.argv[1], sys.argv[2], max=float(sys.argv[3]), min=float(sys.argv[4]), qc_value=int(sys.argv[5]))
+    else:
+        global_range(sys.argv[1], sys.argv[2], max=float(sys.argv[3]), min=float(sys.argv[4]))
\ No newline at end of file
diff --git a/ocean_dp/qc/in_out_water.py b/ocean_dp/qc/in_out_water.py
index 2543c23..e8635ed 100644
--- a/ocean_dp/qc/in_out_water.py
+++ b/ocean_dp/qc/in_out_water.py
@@ -27,49 +27,56 @@
 # flag out of water as QC value 7 (not_deployed), with wise leave as 0
 
 
-def in_out_water(netCDFfile):
+def in_out_water(netCDFfile, var_name=None):
     ds = Dataset(netCDFfile, 'a')
 
-    vars = ds.variables
-
+    nc_vars = ds.variables
     to_add = []
-    for v in vars:
-        #print (vars[v].dimensions)
-        if v != 'TIME':
-            to_add.append(v)
-
-    time_var = vars["TIME"]
+    if var_name:
+        to_add.append(var_name)
+    else:
+        for v in nc_vars:
+            #print (vars[v].dimensions)
+            if v != 'TIME':
+                to_add.append(v)
+
+    time_var = nc_vars["TIME"]
     time = num2date(time_var[:], units=time_var.units, calendar=time_var.calendar)
 
     time_deploy = parser.parse(ds.time_deployment_start, ignoretz=True)
     time_recovery = parser.parse(ds.time_deployment_end, ignoretz=True)
 
-    print(time_deploy)
+    print('deployment time', time_deploy)
 
     print(to_add)
-    for v in to_add:
-        if "TIME" in vars[v].dimensions:
 
-            if v.endswith("_quality_control"):
+    # create a mask for the time range
+    mask = (time <= time_deploy) | (time >= time_recovery)
 
+    for v in to_add:
+        if "TIME" in nc_vars[v].dimensions:
+            if v.endswith("_quality_control"):
                 print("QC time dim ", v)
 
-                ncVarOut = vars[v]
-                mask = (time <= time_deploy) | (time >= time_recovery)
-                ncVarOut[mask] = np.ones(vars[v].shape)[mask] * 7
-
+                ncVarOut = nc_vars[v]
+                ncVarOut[mask] = 7
+            else:
+                # create a qc variable just for this test flags
+                if v + "_quality_control_io" in ds.variables:
+                    ncVarOut = ds.variables[v + "_quality_control_io"]
+                else:
+                    ncVarOut = ds.createVariable(v + "_quality_control_io", "i1", nc_vars[v].dimensions, fill_value=99, zlib=True)  # fill_value=0 otherwise defaults to max
+                ncVarOut[:] = np.zeros(nc_vars[v].shape)
+                ncVarOut.long_name = "quality flag for " + v
+                ncVarOut.flag_values = np.array([0, 1, 2, 3, 4, 6, 7, 9], dtype=np.int8)
+                ncVarOut.flag_meanings = 'unknown good_data probably_good_data probably_bad_data bad_data not_deployed interpolated missing_value'
+
+                nc_vars[v].ancillary_variables = nc_vars[v].ancillary_variables + " " + v + "_quality_control_io"
+                ncVarOut[mask] = 7
+        
+        ds.variables[v + "_quality_control"][:] = np.maximum(ds.variables[v + "_quality_control_io"][:],ds.variables[v + "_quality_control"][:])
 
     ds.file_version = "Level 1 - Quality Controlled Data"
-    
-    # update the history attribute
-    try:
-        hist = nc.history + "\n"
-        
-    except AttributeError:
-        hist = ""
-    
-    nc.setncattr('history', hist + datetime.utcnow().strftime("%Y-%m-%d") + ': in water test performed, with out of water data flagged at QC=7')        
-            
 
     ds.close()
 
@@ -77,4 +84,7 @@ def in_out_water(netCDFfile):
 
 
 if __name__ == "__main__":
-    in_out_water(sys.argv[1])
+    if len(sys.argv) > 2 & sys.argv[1].startswith('-'):
+        in_out_water(sys.argv[2], var_name=sys.argv[1][1:])
+    else:
+        in_out_water(sys.argv[1])
\ No newline at end of file
diff --git a/ocean_dp/qc/rate_of_change_test.py b/ocean_dp/qc/rate_of_change_test.py
index f4e187b..d889b1f 100755
--- a/ocean_dp/qc/rate_of_change_test.py
+++ b/ocean_dp/qc/rate_of_change_test.py
@@ -106,14 +106,31 @@ def roc_test(nc,*args,target_vars_in=[]):
             # For each variable
             for current_var in target_vars:
                 
-                # Extract the data
-                var_data = np.array(nc.variables[current_var])
+                # Extract the variable
+                nc_var = nc.variables[current_var]
+                
+                if nc_var.name + "_quality_control_roc" in nc.variables:
+                    ncVarOut = nc.variables[nc_var.name + "_quality_control_roc"]
+                else:
+                    ncVarOut = nc.createVariable(nc_var.name + "_quality_control_roc", "i1", nc_var.dimensions, fill_value=99, zlib=True)  # fill_value=0 otherwise defaults to max
+                    ncVarOut[:] = np.zeros(nc_var.shape)
+                    ncVarOut.long_name = "quality flag for " + nc_var.name
+                    ncVarOut.flag_values = np.array([0, 1, 2, 3, 4, 6, 7, 9], dtype=np.int8)
+                    ncVarOut.flag_meanings = 'unknown good_data probably_good_data probably_bad_data bad_data not_deployed interpolated missing_value'
+            
+                # add new variable to list of aux variables
+                nc_var.ancillary_variables = nc_var.ancillary_variables + " " + nc_var.name + "_quality_control_roc"
+                
+                # Extract the variable data
+                var_data = np.array(nc.variables[current_var][:])
                 
                 # Calculate dvar/dtime
                 var_roc = np.divide(np.diff(var_data),np.diff(nc_time_hr))
                 
                 # For any change greater than change_per_hr, assign a qc value of 4
-                nc.variables[current_var+'_quality_control'][[x for x in abs(np.insert(var_roc,0,0)) > change_per_hr]] = 4
+                nc.variables[current_var+'_quality_control_roc'][[x for x in abs(np.insert(var_roc,0,0)) > change_per_hr]] = 4
+                
+                nc.variables[current_var  + "_quality_control"][:] = np.maximum(nc.variables[current_var  + "_quality_control_roc"][:],nc.variables[current_var  + "_quality_control"][:])
                 
                 print(current_var + ' tested: '+str(sum([x for x in abs(np.insert(var_roc,0,0)) > change_per_hr])) + ' changes found above '+str(change_per_hr)+' '+nc.variables[current_var].units+' per hour')
                     
@@ -146,6 +163,22 @@ def roc_test(nc,*args,target_vars_in=[]):
             # For each variable
             for current_var in target_vars:
                 
+                # Extract the variable
+                nc_var = nc.variables[current_var]
+                
+                if nc_var.name + "_quality_control_roc" in nc.variables:
+                    ncVarOut = nc.variables[nc_var.name + "_quality_control_roc"]
+                else:
+                    ncVarOut = nc.createVariable(nc_var.name + "_quality_control_roc", "i1", nc_var.dimensions, fill_value=99, zlib=True)  # fill_value=0 otherwise defaults to max
+                    ncVarOut[:] = np.zeros(nc_var.shape)
+                    ncVarOut.long_name = "quality flag for " + nc_var.name
+                    ncVarOut.flag_values = np.array([0, 1, 2, 3, 4, 6, 7, 9], dtype=np.int8)
+                    ncVarOut.flag_meanings = 'unknown good_data probably_good_data probably_bad_data bad_data not_deployed interpolated missing_value'
+            
+                # add new variable to list of aux variables
+                nc_var.ancillary_variables = nc_var.ancillary_variables + " " + nc_var.name + "_quality_control_roc"
+                                
+                
                 # Extract the data
                 var_data = np.array(nc.variables[current_var])
                 
@@ -153,7 +186,9 @@ def roc_test(nc,*args,target_vars_in=[]):
                 var_roc = np.divide(np.diff(var_data),np.diff(nc_time_hr))
                 
                 # For any change greater than change_per_hr, assign a qc value of 4
-                nc.variables[current_var+'_quality_control'][[x for x in abs(np.insert(var_roc,0,0)) > rate_spec[current_var]]] = 4
+                nc.variables[current_var+'_quality_control_roc'][[x for x in abs(np.insert(var_roc,0,0)) > rate_spec[current_var]]] = 4
+                
+                nc.variables[current_var  + "_quality_control"][:] = np.maximum(nc.variables[current_var  + "_quality_control_roc"][:],nc.variables[current_var  + "_quality_control"][:])
                 
                 print(current_var + ' tested: '+str(sum([x for x in abs(np.insert(var_roc,0,0)) > rate_spec[current_var]])) + ' changes found above '+str(rate_spec[current_var])+' '+nc.variables[current_var].units+' per hour')
                     
diff --git a/ocean_dp/qc/spike_test.py b/ocean_dp/qc/spike_test.py
index a061c32..622d3a3 100755
--- a/ocean_dp/qc/spike_test.py
+++ b/ocean_dp/qc/spike_test.py
@@ -26,15 +26,19 @@
 import pytz
 import os
 
+default_high = 100
+
+default_low = 50
+
 
 # If files aren't specified, take all the IMOS*.nc files in the current folder
-def spike_test_all_files(target_vars_in=[], thresh_low=10, thresh_high=20, flag_low=3, flag_high=4):
+def spike_test_all_files(target_vars_in=[], thresh_low=default_low, thresh_high=default_high, flag_low=3, flag_high=4):
     target_files = glob.glob('IMOS*.nc')
 
     spike_test_files(target_files, target_vars_in=target_vars_in, thresh_low=thresh_low,thresh_high=thresh_high,flag_low=flag_low, flag_high=flag_high)
 
 
-def spike_test_files(target_files, target_vars_in=[], thresh_low=10, thresh_high=20, flag_low=3, flag_high=4):
+def spike_test_files(target_files, target_vars_in=[], thresh_low=default_low, thresh_high=default_high, flag_low=3, flag_high=4):
     
     # Loop through each files in target_files
     for current_file in target_files:
@@ -48,7 +52,7 @@ def spike_test_files(target_files, target_vars_in=[], thresh_low=10, thresh_high
         spike_test(nc=nc, target_vars_in=target_vars_in, thresh_low=thresh_low,thresh_high=thresh_high,flag_low=flag_low, flag_high=flag_high)
 
 
-def spike_test(nc, target_vars_in=[], thresh_low=10, thresh_high=20, flag_low=3, flag_high=4):
+def spike_test(nc, target_vars_in=[], thresh_low=default_low, thresh_high=default_high, flag_low=3, flag_high=4):
     
     # If target_vars aren't user specified, set it to all the variables of 
     # the current_file, removing unwanted variables
@@ -72,12 +76,27 @@ def spike_test(nc, target_vars_in=[], thresh_low=10, thresh_high=20, flag_low=3,
     else:
         target_vars = target_vars_in
         
-    # For each variable, extract the data 
+    # For each variable
     for current_var in target_vars:
         
-        var_data = np.array(nc.variables[current_var])
-        
-        
+        # Extract the variable
+        nc_var = nc.variables[current_var]
+        
+        # Create a test specific qc variable if it  doesn't already exist
+        if nc_var.name + "_quality_control_spk" in nc.variables:
+            ncVarOut = nc.variables[nc_var.name + "_quality_control_spk"]
+        else:
+            ncVarOut = nc.createVariable(nc_var.name + "_quality_control_spk", "i1", nc_var.dimensions, fill_value=99, zlib=True)  # fill_value=0 otherwise defaults to max
+            ncVarOut[:] = np.zeros(nc_var.shape)
+            ncVarOut.long_name = "quality flag for " + nc_var.name
+            ncVarOut.flag_values = np.array([0, 1, 2, 3, 4, 6, 7, 9], dtype=np.int8)
+            ncVarOut.flag_meanings = 'unknown good_data probably_good_data probably_bad_data bad_data not_deployed interpolated missing_value'
+    
+        # add new variable to list of aux variables
+        nc_var.ancillary_variables = nc_var.ancillary_variables + " " + nc_var.name + "_quality_control_spk"
+                
+        # Extract the variable data
+        var_data = np.array(nc.variables[current_var][:])
         
         print('checking '+current_var+' for high spikes')
         
@@ -96,20 +115,12 @@ def spike_test(nc, target_vars_in=[], thresh_low=10, thresh_high=20, flag_low=3,
                 print('High spike found')
                 
                 #set corresponding QC value to...
-                nc.variables[current_var+'_quality_control'][i] = flag_high
+                nc.variables[current_var+'_quality_control_spk'][i] = flag_high
                 
-        # # Extract the qc data         
-        # current_qc = np.array(nc.variables[current_var+'_quality_control'][:])      
-        
-        # # Find all the instances of consecutive 4s, and reset them to 0        
-        # for i in np.where(current_qc==4)[0][0:-1][np.diff(np.where(current_qc==4)[0])==1]:
-
-        #     nc.variables[current_var+'_quality_control'][i:i+2] = 0       
         
         # Find the indices where qc isn't set to 4 (high spike), removing the final element as it can't be check for a spike
         low_spike_chk_idx = np.where(nc.variables[current_var+'_quality_control'][:]!=4)[0][0:-1]
-        
-        #print(low_spike_chk_idx)
+
         
         # Remove from the indices those that are either side of a high spike
         for i in np.where(nc.variables[current_var+'_quality_control'][:]==4)[0]:
@@ -133,28 +144,15 @@ def spike_test(nc, target_vars_in=[], thresh_low=10, thresh_high=20, flag_low=3,
             # Calculate the step changes
             shoulder_diff = np.diff(var_data[i-1:i+2])
             
-            #print('shoulder mean is '+str(shoulder_mean))
-            
-            #abs_diff = abs(var_data[i]-shoulder_mean)
-            
-            #print('absolute difference is '+str(abs_diff))
-            
             # Check for spike exceeding low threshold
             if (abs(var_data[i]-shoulder_mean) > thresh_low) & (True in (shoulder_diff>0)) & (True in (shoulder_diff<0)): #& (1.25*abs(shoulder_diff[0]) >= abs(x[1]) >= 0.75*abs(shoulder_diff[0])):
                 
                 print('Low spike found')
                 
                 #set corresponding QC value to...
-                nc.variables[current_var+'_quality_control'][i] = flag_low        
-
-        # # Extract the qc data         
-        # current_qc = np.array(nc.variables[current_var+'_quality_control'][:])      
-        
-        # # Find all the instances of consecutive 3s, and reset them to 0        
-        # for i in np.where(current_qc==3)[0][0:-1][np.diff(np.where(current_qc==3)[0])==1]:
-
-        #     nc.variables[current_var+'_quality_control'][i:i+2] = 0     
-
+                nc.variables[current_var+'_quality_control_spk'][i] = flag_low   
+                
+    nc.variables[current_var  + "_quality_control"][:] = np.maximum(nc.variables[current_var  + "_quality_control_spk"][:],nc.variables[current_var  + "_quality_control"][:])
 
     # update the history attribute
     try:
diff --git a/ocean_dp/qc/spike_test_ver_2.py b/ocean_dp/qc/spike_test_ver_2.py
deleted file mode 100755
index 68fe69e..0000000
--- a/ocean_dp/qc/spike_test_ver_2.py
+++ /dev/null
@@ -1,149 +0,0 @@
-# Copyright (C) 2020 Ben Weeding
-#
-# This program is free software: you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation, either version 3 of the License, or
-# (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program. If not, see <http://www.gnu.org/licenses/>.
-
-import re
-from datetime import datetime, timedelta
-from netCDF4 import num2date, date2num
-from netCDF4 import stringtochar
-import numpy.ma as ma
-import sys
-from netCDF4 import Dataset
-import numpy as np
-import argparse
-import glob
-import pytz
-import os
-
-# If files aren't specified, take all the IMOS*.nc files in the current folder
-def spike_test_all_files(target_vars_in=[], thresh_low=2, thresh_high=4, flag_low=3, flag_high=4):
-    target_files = glob.glob('IMOS*.nc')
-
-    spike_test_files(target_files, target_vars_in=target_vars_in, thresh_low=thresh_low,thresh_high=thresh_high,flag_low=flag_low, flag_high=flag_high)
-
-
-def spike_test_files(target_files, target_vars_in=[], thresh_low=2, thresh_high=4, flag_low=3, flag_high=4):
-    
-    # Loop through each files in target_files
-    for current_file in target_files:
-        # Print each filename
-        print("input file %s" % current_file)
-
-        # Extract netcdf data into nc
-        nc = Dataset(current_file, mode="a")
-
-        # run the spike test
-        spike_test(nc=nc, target_vars_in=target_vars_in, thresh_low=thresh_low,thresh_high=thresh_high,flag_low=flag_low, flag_high=flag_high)
-
-
-
-def spike_test(nc, target_vars_in=[], thresh_low=2, thresh_high=4, flag_low=3, flag_high=4):
-    
-    # If target_vars aren't user specified, set it to all the variables of 
-    # the current_file, removing unwanted variables
-    if target_vars_in == []:
-        
-        target_vars = list(nc.variables.keys())
-        
-        # Remove TIME
-        target_vars.remove('TIME')
-        
-        # Remove any quality_control variables
-        qc_vars = [s for s in target_vars if 'quality_control' in s]
-        target_vars = [s for s in target_vars if s not in qc_vars]
-                        
-        # Remove any variables of single length
-        single_vars = [s for s in target_vars if nc.variables[s].size==1]
-        target_vars = [s for s in target_vars if s not in single_vars]
-        
-        print('target_vars are '+' '.join(target_vars))
-        
-    else:
-        target_vars = target_vars_in
-        
-    # For each variable, extract the data 
-    for current_var in target_vars:
-        
-        var_data = np.array(nc.variables[current_var])
-        
-        print('checking '+current_var+' for high spikes')
-        
-        # Step through the data, one element at a time, starting from the 2nd element
-        for i in range(1,(len(var_data)-1)):
-            
-            # Calculate the mean of the i-1 and i+1 elements
-            shoulder_mean = np.mean(np.take(var_data,[i-1,i+1]))
-            
-            # Calculate the step changes
-            shoulder_diff = np.diff(var_data[i-1:i+2])
-            
-            # Check for spike exceeding high threshold
-            if (abs(var_data[i]-shoulder_mean) > thresh_high) & (True in (shoulder_diff>=0)) & (False in (shoulder_diff>=0)):
-                
-                print('High spike found')
-                
-                #set corresponding QC value to...
-                nc.variables[current_var+'_quality_control'][i] = flag_high
-        
-        print('checking '+current_var+' for low spikes')
-        
-        # For each of the remaining indices
-        for i in low_spike_chk_idx:
-            
-            #print('i is '+str(i))
-            
-            # Calculate the mean of the i-1 and i+1 elements
-            shoulder_mean = np.mean(np.take(var_data,[i-1,i+1]))
-            
-            # Calculate the step changes
-            shoulder_diff = np.diff(var_data[i-1:i+2])
-            
-            #print('shoulder mean is '+str(shoulder_mean))
-            
-            abs_diff = abs(var_data[i]-shoulder_mean)
-            
-            #print('absolute difference is '+str(abs_diff))
-            
-            # Check for spike exceeding low threshold
-            if (abs(var_data[i]-shoulder_mean) > thresh_low) & (True in (shoulder_diff>=0)) & (False in (shoulder_diff>=0)):
-                
-                print('Low spike found')
-                
-                #set corresponding QC value to...
-                nc.variables[current_var+'_quality_control'][i] = flag_low        
-
-        # # Extract the qc data         
-        # current_qc = np.array(nc.variables[current_var+'_quality_control'][:])      
-        
-        # # Find all the instances of consecutive 3s, and reset them to 0        
-        # for i in np.where(current_qc==3)[0][0:-1][np.diff(np.where(current_qc==3)[0])==1]:
-
-        #     nc.variables[current_var+'_quality_control'][i:i+2] = 0     
-
-
-    # update the history attribute
-    try:
-        hist = nc.history + "\n"
-    except AttributeError:
-        hist = ""
-
-    nc.setncattr('history', hist + datetime.utcnow().strftime("%Y-%m-%d") + ' :spike_test performed on [' + str(target_vars) + '], with spikes greater than '+str(thresh_high)+' flagged as '+str(flag_high)+' and spikes greater than '+str(thresh_low)+' flagged as '+str(flag_low))
-
-    nc.close()
-    
-if __name__ == "__main__":
-    # usage is <file_name> <variable_name> <window> <flag value>
-    spike_test_files(target_files=[sys.argv[1]], target_vars_in=[sys.argv[2]], thresh_low=float(sys.argv[3]), thresh_high=float(sys.argv[4]), flag_low= float(sys.argv[5]), flag_high= float(sys.argv[6]))
-
-    
\ No newline at end of file
diff --git a/ocean_dp/sots_processing_runthrough.py b/ocean_dp/sots_processing_runthrough.py
new file mode 100755
index 0000000..491f3a5
--- /dev/null
+++ b/ocean_dp/sots_processing_runthrough.py
@@ -0,0 +1,74 @@
+# Copyright (C) 2020 Ben Weeding
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+# Initial package import
+import sys
+import glob
+import fnmatch
+import os
+
+# Addition of folder containing user defined packages/modules
+sys.path.append('/Users/tru050/Documents/GitHub/imos-tools/ocean_dp/qc')
+sys.path.append('/Users/tru050/Documents/GitHub/imos-tools/ocean_dp/aggregation')
+sys.path.append('/Users/tru050/Documents/GitHub/imos-tools/ocean_dp/processing')
+
+# Import user defined packages
+import add_qc_flags
+import in_out_water
+import copyDataset
+import pressure_interpolator
+
+import global_range
+import rate_of_change_test
+
+# Set the working directory
+os.chdir('/Users/tru050/Desktop/sofs7.5 test data')
+
+# Make a list of FV00 filenames
+fv00_files = glob.glob('*IMOS_ABOS-SOTS*FV00*.nc')
+
+# Run add_qc_flags.py and collect FV01 filenames
+fv01_files = add_qc_flags.add_qc(fv00_files)
+
+# Run in_out_water.py
+for ncfile in fv01_files:
+    
+    in_out_water.in_out_water(ncfile,var_name='TEMP')
+
+# Select pressure files using matching = fnmatch.filter(sofs75filesfv01,'*SOTS*P*_2*.nc')
+pres_files = fnmatch.filter(fv01_files,'*IMOS_ABOS-SOTS*P*_2*FV01*.nc')
+
+# Run copyDataset.py
+copyDataset.aggregate(pres_files,'PRES')
+
+# Run pressure_interpolator.py
+fv01_pres_interp_files = pressure_interpolator.pressure_interpolator(netCDFfiles=fv01_files,agg=glob.glob('*IMOS_ABOS-SOTS*Aggregate*.nc')[0])
+
+# Global range test
+for ncfile in fv01_pres_interp_files:
+    
+    print(ncfile)
+    
+    global_range.global_range(ncfile,'TEMP',40,-2)
+
+# Rate of change
+rate_of_change_test.roc_test_files(fv01_pres_interp_files,'TEMP',20)
+
+# Spike
+
+
+# Flatline
+
+

From 07b40fdd30982cd558448c4adec61e3cc8f424aa Mon Sep 17 00:00:00 2001
From: bweeding <57697604+bweeding@users.noreply.github.com>
Date: Wed, 22 Apr 2020 14:03:52 +1000
Subject: [PATCH 36/59] Qc checker and variable selector

---
 ocean_dp/qc/qc_checker.py              | 116 +++++++++++++++++++++++++
 ocean_dp/qc/rate_of_change_test.py     |  13 +--
 ocean_dp/sots_processing_runthrough.py |  17 +++-
 3 files changed, 134 insertions(+), 12 deletions(-)
 create mode 100755 ocean_dp/qc/qc_checker.py

diff --git a/ocean_dp/qc/qc_checker.py b/ocean_dp/qc/qc_checker.py
new file mode 100755
index 0000000..5d8720d
--- /dev/null
+++ b/ocean_dp/qc/qc_checker.py
@@ -0,0 +1,116 @@
+# Copyright (C) 2020 Ben Weeding
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+
+import re
+from datetime import datetime, timedelta
+from netCDF4 import num2date, date2num
+from netCDF4 import stringtochar
+import numpy.ma as ma
+import sys
+from netCDF4 import Dataset
+import numpy as np
+import argparse
+import glob
+import pytz
+import os
+
+# Code is designed to check that netcdf files processed using the SOTS methods 
+# conform to the QC labelling designed by Peter Jansen.
+
+def qc_checker_all_files(target_vars_in=[]):
+    
+    successful_files=[]
+    
+    target_files = glob.glob('IMOS*.nc')
+
+    successful_files = qc_checker_files(target_files, target_vars_in=target_vars_in)
+    
+
+def qc_checker_files(target_files,target_vars_in=[]):
+    
+    successful_files=[]
+    
+    # Loop through each files in target_files
+    for current_file in target_files:
+        # Print each filename
+        print("input file %s" % current_file)
+
+        # Extract netcdf data into nc
+        nc = Dataset(current_file, mode="a")
+
+        # run the spike test - specifying *args here makes python unpack args to be passed again successfully as separate items
+        if qc_checker(nc,target_vars_in=target_vars_in):
+        
+            successful_files.append(current_file)
+        
+    return successful_files
+
+
+# Enter args as variable name and rate of change limit, ie. 'TEMP',4
+def qc_checker(nc,target_vars_in=[]):
+    
+    all_vars = list(nc.variables.keys())
+    
+    # If target_vars aren't user specified, set it to all the variables of 
+    # the current_file, removing unwanted variables (qc, single length, TIME)
+    if target_vars_in == []:
+                
+        target_vars = [s for s in all_vars if 'TIME' not in s and 'quality_control' not in s and nc.variables[s].size!=1]
+        
+        print('target_vars are '+' '.join(target_vars))
+        
+    else:
+        target_vars = target_vars_in    
+        
+    qc_behaving = True    
+        
+    for current_var in target_vars:
+        
+        qc_global_data = np.array(nc.variables[current_var+"_quality_control"][:])
+        
+        qc_test_specific = [s for s in all_vars if current_var+"_quality_control" in s and not s.endswith('control')]
+        
+        for current_qc_test in qc_test_specific:
+            
+            qc_test_data = np.array(nc.variables[current_qc_test])
+            
+            #print('checking '+current_qc_test)
+            
+            # If true, fail process
+            if any(np.less(qc_global_data,qc_test_data)):
+                
+                print(current_qc_test + "failed")
+                
+                qc_behaving = False
+                
+    if qc_behaving:
+        
+        return True
+    
+
+    # Close the current netcdf file
+    nc.close()
+    
+    
+    
+    
+    
+    
+    
+    
+    
+    
+    
\ No newline at end of file
diff --git a/ocean_dp/qc/rate_of_change_test.py b/ocean_dp/qc/rate_of_change_test.py
index d889b1f..a7f54d1 100755
--- a/ocean_dp/qc/rate_of_change_test.py
+++ b/ocean_dp/qc/rate_of_change_test.py
@@ -79,18 +79,9 @@ def roc_test(nc,*args,target_vars_in=[]):
             # the current_file, removing unwanted variables
             if target_vars_in == []:
                 
-                target_vars = list(nc.variables.keys())
+                all_vars = list(nc.variables.keys())
                 
-                # Remove TIME
-                target_vars.remove('TIME')
-                
-                # Remove any quality_control variables
-                qc_vars = [s for s in target_vars if 'quality_control' in s]
-                target_vars = [s for s in target_vars if s not in qc_vars]
-                                
-                # Remove any variables of single length
-                single_vars = [s for s in target_vars if nc.variables[s].size==1]
-                target_vars = [s for s in target_vars if s not in single_vars]
+                target_vars = [s for s in all_vars if 'TIME' not in s and 'quality_control' not in s and nc.variables[s].size!=1]
                 
                 print('target_vars are '+' '.join(target_vars))
                 
diff --git a/ocean_dp/sots_processing_runthrough.py b/ocean_dp/sots_processing_runthrough.py
index 491f3a5..474eb3a 100755
--- a/ocean_dp/sots_processing_runthrough.py
+++ b/ocean_dp/sots_processing_runthrough.py
@@ -18,6 +18,7 @@
 import glob
 import fnmatch
 import os
+import time
 
 # Addition of folder containing user defined packages/modules
 sys.path.append('/Users/tru050/Documents/GitHub/imos-tools/ocean_dp/qc')
@@ -32,6 +33,11 @@
 
 import global_range
 import rate_of_change_test
+import spike_test
+import flatline_test
+import qc_checker
+
+start = time.time()
 
 # Set the working directory
 os.chdir('/Users/tru050/Desktop/sofs7.5 test data')
@@ -67,8 +73,17 @@
 rate_of_change_test.roc_test_files(fv01_pres_interp_files,'TEMP',20)
 
 # Spike
-
+spike_test.spike_test_files(fv01_pres_interp_files,target_vars_in=['TEMP'])
 
 # Flatline
+flatline_test.flatline_test_files(fv01_pres_interp_files,['TEMP'])
+
+# Check qc process has worked
+fv01_qc_checked = qc_checker.qc_checker_files(fv01_pres_interp_files,['TEMP'])
+
+end = time.time()
+
+print('time elapsed: '+end-start)
+
 
 

From a6bb5603206ca99784be304a4e20d676dd64e7fc Mon Sep 17 00:00:00 2001
From: bweeding <57697604+bweeding@users.noreply.github.com>
Date: Wed, 22 Apr 2020 14:23:55 +1000
Subject: [PATCH 37/59] commented qc checker

---
 ocean_dp/qc/qc_checker.py              | 28 +++++++++++++++++---------
 ocean_dp/sots_processing_runthrough.py |  2 +-
 2 files changed, 20 insertions(+), 10 deletions(-)

diff --git a/ocean_dp/qc/qc_checker.py b/ocean_dp/qc/qc_checker.py
index 5d8720d..e229d72 100755
--- a/ocean_dp/qc/qc_checker.py
+++ b/ocean_dp/qc/qc_checker.py
@@ -36,6 +36,7 @@ def qc_checker_all_files(target_vars_in=[]):
     
     target_files = glob.glob('IMOS*.nc')
 
+    # Returns the files that conform to the qc labelling
     successful_files = qc_checker_files(target_files, target_vars_in=target_vars_in)
     
 
@@ -51,17 +52,19 @@ def qc_checker_files(target_files,target_vars_in=[]):
         # Extract netcdf data into nc
         nc = Dataset(current_file, mode="a")
 
-        # run the spike test - specifying *args here makes python unpack args to be passed again successfully as separate items
+        # If the qc_checker was successful, add the filename to the list
         if qc_checker(nc,target_vars_in=target_vars_in):
         
             successful_files.append(current_file)
-        
+    
+    # Returns the files that conform to the qc labelling    
     return successful_files
 
 
 # Enter args as variable name and rate of change limit, ie. 'TEMP',4
 def qc_checker(nc,target_vars_in=[]):
     
+    # Collect all the variables from the netcdf
     all_vars = list(nc.variables.keys())
     
     # If target_vars aren't user specified, set it to all the variables of 
@@ -74,31 +77,38 @@ def qc_checker(nc,target_vars_in=[]):
         
     else:
         target_vars = target_vars_in    
-        
-    qc_behaving = True    
-        
+           
+    # For each of the variables selected    
     for current_var in target_vars:
         
+        # Collect the global qc data
         qc_global_data = np.array(nc.variables[current_var+"_quality_control"][:])
         
+        # Collect the names of all the other test specific qc vectors
         qc_test_specific = [s for s in all_vars if current_var+"_quality_control" in s and not s.endswith('control')]
         
+        # For each of the other test specific qc vectors
         for current_qc_test in qc_test_specific:
             
+            # Extract the data
             qc_test_data = np.array(nc.variables[current_qc_test])
             
             #print('checking '+current_qc_test)
             
-            # If true, fail process
+            # If any test-specific qc vector ever has a greater value than the global qc vector at a timestamp, the qc process has failed
             if any(np.less(qc_global_data,qc_test_data)):
                 
                 print(current_qc_test + "failed")
                 
                 qc_behaving = False
                 
-    if qc_behaving:
-        
-        return True
+            else:
+                
+                # The qc process has succeeded 
+                qc_behaving = True
+    
+    # Returns true if qc has succeeded, false if not
+    return qc_behaving
     
 
     # Close the current netcdf file
diff --git a/ocean_dp/sots_processing_runthrough.py b/ocean_dp/sots_processing_runthrough.py
index 474eb3a..6c8bd90 100755
--- a/ocean_dp/sots_processing_runthrough.py
+++ b/ocean_dp/sots_processing_runthrough.py
@@ -83,7 +83,7 @@
 
 end = time.time()
 
-print('time elapsed: '+end-start)
+print('time elapsed: '+str(end-start))
 
 
 

From f97718bcd8264cf0450c8bd19d8cfe85c5b7af74 Mon Sep 17 00:00:00 2001
From: bweeding <57697604+bweeding@users.noreply.github.com>
Date: Mon, 27 Apr 2020 17:00:20 +1000
Subject: [PATCH 38/59] Changed tests to update netcdfs correctly when
 reprocessing data

---
 ocean_dp/plotting/plotNetCDFmultiqc.py | 340 +++++++++++++++++++++++++
 ocean_dp/qc/flatline_test.py           | 145 ++++++-----
 ocean_dp/qc/global_range.py            |   3 +-
 ocean_dp/qc/qc_checker.py              |   6 +-
 ocean_dp/qc/rate_of_change_test.py     |  57 ++++-
 ocean_dp/qc/spike_test.py              |   8 +-
 ocean_dp/sots_processing_runthrough.py |   6 +-
 7 files changed, 479 insertions(+), 86 deletions(-)
 create mode 100755 ocean_dp/plotting/plotNetCDFmultiqc.py

diff --git a/ocean_dp/plotting/plotNetCDFmultiqc.py b/ocean_dp/plotting/plotNetCDFmultiqc.py
new file mode 100755
index 0000000..9442a56
--- /dev/null
+++ b/ocean_dp/plotting/plotNetCDFmultiqc.py
@@ -0,0 +1,340 @@
+#!/usr/bin/python3
+
+# raw2netCDF
+# Copyright (C) 2019 Peter Jansen
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+from netCDF4 import Dataset
+from netCDF4 import num2date
+import datetime as dt
+import numpy as np
+import matplotlib
+
+matplotlib.use('Agg')
+
+import matplotlib.pyplot as plt
+from matplotlib.backends.backend_pdf import PdfPages
+import sys
+import os
+from matplotlib import rc
+
+# rc('text', usetex=True)
+
+for path_file in sys.argv[1:len(sys.argv)]:
+
+    nc = Dataset(path_file)
+
+    # get time variable
+    nctime = nc.variables['TIME'][:]
+    t_unit = nc.variables['TIME'].units  # get unit  "days since 1950-01-01T00:00:00Z"
+
+    try:
+        t_cal = nc.variables['TIME'].calendar
+    except AttributeError:  # Attribute doesn't exist
+        t_cal = u"gregorian"  # or standard
+
+    dt_time = [num2date(t, units=t_unit, calendar=t_cal) for t in nctime]
+
+    # work out variables to plot
+    nc_vars_to_plot = [var for var in nc.variables]
+
+    # remove any dimensions from the list to plot
+    nc_dims = [dim for dim in nc.dimensions]  # list of nc dimensions
+
+    for i in nc_dims:
+        try:
+            nc_vars_to_plot.remove(i)
+        except ValueError:
+            print('did not remove ', i)
+
+    # collect the names of any auxiliary (ancillary) variables so they are excluded from the list to plot
+    aux_vars = list()
+    for var in nc.variables:
+        try:
+            aux_vars.append(nc.variables[var].getncattr('ancillary_variables'))
+        except AttributeError:
+            pass
+
+    # remove any variables without a TIME dimension from the list to plot
+    to_plot = list()
+
+    for var in nc.variables:
+        # print var
+        if var in nc_dims:
+            continue
+        if var in aux_vars:
+            continue
+        if 'TIME' in nc.variables[var].dimensions:
+            print('to plot ', var)
+            to_plot.append(var)
+
+    # pdffile = path_file[path_file.rfind('/')+1:len(path_file)] + '-' + nc.getncattr('deployment_code') + '-plot.pdf'
+
+    pdffile = path_file + '.pdf'
+
+    pp = PdfPages(pdffile)
+
+    txt = ""
+    lines = 0
+    plt.figure(figsize=(11.69, 8.27))
+
+    txt = 'file name : ' + os.path.basename(path_file) + '\n\n'
+
+    txt += 'Dimensions:\n'
+    for x in nc.dimensions:
+        txt += '    ' + x + ' (' + str(nc.dimensions[x].size) + ')\n'
+
+    txt += '\nVariables:\n'
+    for x in nc.variables:
+        v_atts = nc.variables[x]
+        var_line = '    ' + x + ' ' + str(v_atts.dimensions)
+
+        try:
+            var_line += ' : long_name = ' + v_atts.long_name
+        except AttributeError:
+            pass
+        try:
+            var_line += ' (' + v_atts.units + ')'
+        except AttributeError:
+            pass
+        var_line += ' : type ' + str(v_atts.datatype)
+
+        print(var_line)
+
+        lines = txt.count('\n') + var_line.count('\n')
+        # print("lines ", lines)
+        if lines > 57:
+            #print(txt)
+            print('new page')
+            plt.text(-0.1, -0.1, txt, fontsize=8, family='monospace')
+            plt.axis('off')
+            pp.savefig()
+            plt.close()
+            plt.figure(figsize=(11.69, 8.27))
+
+            txt = ""
+
+            lines = 0
+
+        txt += var_line + '\n'
+
+    plt.figure(figsize=(11.69, 8.27))
+
+    plt.text(-0.1, -0.1, txt, fontsize=8, family='monospace')
+    plt.axis('off')
+    pp.savefig()
+    plt.close()
+
+    txt = ""
+    plt.figure(figsize=(11.69, 8.27))
+
+    lines = 0
+    # print "NetCDF Global Attributes:"
+    for nc_attr in sorted(nc.ncattrs(), key=lambda s: s.lower()):
+        #print('\t%s:' % nc_attr, repr(nc.getncattr(nc_attr)))
+        attrib_txt = nc_attr + ' : ' + str(nc.getncattr(nc_attr)).replace('\n', '\n   ') + '\n'
+        lines = txt.count('\n') + attrib_txt.count('\n')
+        # print("lines ", lines)
+        if lines > 57:
+            #print(txt)
+            print('new page')
+            plt.text(-0.1, -0.1, txt, fontsize=8, family='monospace')
+            plt.axis('off')
+            pp.savefig()
+            plt.close()
+            plt.figure(figsize=(11.69, 8.27))
+
+            txt = ""
+
+            lines = 0
+
+        txt += attrib_txt
+
+        lines += 1
+
+    #print(txt)
+    plt.text(-0.1, -0.1, txt, fontsize=8, family='monospace')
+    plt.axis('off')
+    pp.savefig()
+    plt.close()
+
+    # plot each variable in the to_plot list
+    for plot in to_plot:
+
+        plot_var = nc.variables[plot]
+
+        var = plot_var[:]
+        shape_len = len(var.shape)
+
+        # create a page with information about the variable, and any aux variables
+        fig = plt.figure(figsize=(11.69, 8.27))
+
+        text = "Variable : " + plot_var.name + str(plot_var.dimensions) + "\n"
+        nc_attrs = plot_var.ncattrs()
+        # print "NetCDF Variable Attributes:"
+        for nc_attr in nc_attrs:
+            attrVal = plot_var.getncattr(nc_attr)
+            #print('\t%s:' % nc_attr, repr(plot_var.getncattr(nc_attr)), type(attrVal))
+            text += nc_attr + ' : ' + str(attrVal) + '\n'
+
+        if hasattr(plot_var, 'ancillary_variables'):
+            qc_var_name = plot_var.getncattr('ancillary_variables')
+            qc_var = nc.variables[qc_var_name]
+
+            text += "\nAUX : " + qc_var.name + str(qc_var.dimensions) + "\n"
+
+            nc_attrs = qc_var.ncattrs()
+            # print "NetCDF AUX Variable Attributes:"
+            for nc_attr in nc_attrs:
+                # print '\t%s:' % nc_attr, repr(nc.getncattr(nc_attr))
+                text += nc_attr + ' : ' + str(qc_var.getncattr(nc_attr)) + '\n'
+
+            qc = nc.variables[qc_var_name][:]
+
+            if plot_var.dimensions[0] != 'TIME':
+                qc = np.transpose(qc)
+
+            qc = np.squeeze(qc)
+        else:
+            qc = 0
+
+        plt.text(-0.1, 0.0, text, fontsize=8, family='monospace')
+        plt.axis('off')
+        pp.savefig(fig)
+        plt.close(fig)
+
+        print(plot_var.name, " shape ", var.shape, " len ", shape_len)
+
+        # now create a page with the plot
+
+        fig = plt.figure(figsize=(11.69, 8.27))
+        ax = plt.subplot(111)
+
+        if plot_var.dimensions[0] != 'TIME':
+            var = np.transpose(var)
+        var = np.squeeze(var)
+
+        # create range from only good data
+        qc_m = np.ma.masked_where((qc == 9) | (qc == 4) | (qc == 6), var)
+        mx = qc_m.max()
+        mi = qc_m.min()
+
+        marg = (mx - mi) * 0.1
+        print("max ", mx, " min ", mi)
+
+        plt.ylim([mi - marg, mx + marg])
+
+        # build legend labels from serial number and depth; both are kept as lists so zip() pairs whole labels
+        if hasattr(plot_var, 'sensor_serial_number'):
+            sn = plot_var.getncattr('sensor_serial_number').split('; ')
+        elif hasattr(nc, 'instrument_serial_number'):  # fall back to the file-level (global) attribute
+            sn = nc.getncattr('instrument_serial_number').split('; ')
+        else:
+            sn = ['not found']  # one-element list: a bare string would be zipped character by character
+
+        if hasattr(plot_var, 'sensor_depth'):
+            dpth = plot_var.getncattr('sensor_depth').split('; ')
+        elif hasattr(plot_var, 'sensor_height'):
+            dpth = plot_var.getncattr('sensor_height').split('; ')
+        elif hasattr(nc, 'instrument_nominal_depth'):
+            dpth = str(nc.getncattr('instrument_nominal_depth')).split('; ')
+        else:
+            dpth = ['unknown']  # one-element list, same reason as the serial-number fallback
+
+        print("depth ", dpth)
+
+        leg = [x + ' (' + y + ' m)' for x, y in zip(sn, dpth)]
+
+        # if less than 200 points plot with a dot and line
+        plot_marks = '-'
+        if len(dt_time) < 200:
+            plot_marks = '.-'
+
+        pl = ax.plot(dt_time, qc_m, plot_marks)
+
+        # mark qc>2 with yellow dot, qc>3 with red dot
+        qc_m = np.ma.masked_where((qc <= 2) | (qc == 8), var)
+        ax.plot(dt_time, qc_m, 'yo')
+        qc_m = np.ma.masked_where((qc <= 3) | (qc == 8), var)
+        ax.plot(dt_time, qc_m, 'ro')
+
+        # shrink the plot some
+        box = ax.get_position()
+        ax.set_position([box.x0, box.y0 + box.height * 0.1, box.width, box.height * 0.9])
+
+        # add legend below plot
+        #plt.legend(iter(pl), leg, loc='lower center', bbox_to_anchor=(0.5, -0.05), ncol=5)
+
+        plt.legend(iter(pl), leg, bbox_to_anchor=(0.0, -0.2, 1.0, -0.15), loc=3, ncol=6, mode="expand", borderaxespad=0.0, fontsize='x-small')
+
+        # invert the yaxis if the units are dbar
+        try:
+            if plot_var.units == 'dbar':
+                plt.gca().invert_yaxis()
+        except AttributeError:
+            pass
+        try:
+            if plot_var.positive == 'down':
+                plt.gca().invert_yaxis()
+        except AttributeError:
+            pass
+
+        #fig.autofmt_xdate()
+        plt.grid()
+
+        # add deployment/instrument/standard name as title
+
+        # plt.title(nc.getncattr('deployment_code') + ' : ' + plot_var.sensor_name + ' ' + \
+        #          plot_var.sensor_serial_number + ' : ' + plot_var.name, fontsize=10)
+
+        # plt.title(nc.getncattr('deployment_code') + ' : ' + plot_var.getncattr('name'), fontsize=10)
+        try:
+            plt.title(nc.getncattr('deployment_code'), fontsize=10)
+        except AttributeError:
+            pass
+
+        # add units to Y axis
+        try:
+            plt.ylabel(plot + ' (' + plot_var.units + ')')
+        except AttributeError:
+            pass
+
+        date_time_start = None
+        date_time_end = None
+
+        # plot only the time of deployment
+        try:
+            date_time_start = dt.datetime.strptime(nc.getncattr('time_coverage_start'), '%Y-%m-%dT%H:%M:%SZ')
+            date_time_end = dt.datetime.strptime(nc.getncattr('time_coverage_end'), '%Y-%m-%dT%H:%M:%SZ')
+        except AttributeError:
+            pass
+        try:
+            date_time_start = dt.datetime.strptime(nc.getncattr('time_deployment_start'), '%Y-%m-%dT%H:%M:%SZ')
+            date_time_end = dt.datetime.strptime(nc.getncattr('time_deployment_end'), '%Y-%m-%dT%H:%M:%SZ')
+        except AttributeError:
+            pass
+
+        if date_time_start:
+            plt.xlim(date_time_start, date_time_end)
+
+        # plt.savefig(plot + '.pdf')
+        pp.savefig(fig, papertype='a4')
+        plt.close(fig)
+
+    # plt.show()
+
+    pp.close()
+
+    nc.close()
diff --git a/ocean_dp/qc/flatline_test.py b/ocean_dp/qc/flatline_test.py
index a9e7d23..79ab981 100755
--- a/ocean_dp/qc/flatline_test.py
+++ b/ocean_dp/qc/flatline_test.py
@@ -28,13 +28,13 @@
 
 
 # If files aren't specified, take all the IMOS*.nc files in the current folder
-def flatline_test_all_files(target_vars_in=[], window=3, flag=4):
+def flatline_test_all_files(target_vars_in=[], window=5, flag=4):
     target_files = glob.glob('IMOS*.nc')
 
     flatline_test_files(target_files, target_vars_in=target_vars_in, window=window, flag=flag)
 
 
-def flatline_test_files(target_files, target_vars_in=[], window=3, flag=4):
+def flatline_test_files(target_files, target_vars_in=[], window=5, flag=4):
     
     # Loop through each files in target_files
     for current_file in target_files:
@@ -48,74 +48,87 @@ def flatline_test_files(target_files, target_vars_in=[], window=3, flag=4):
         flatline_test(nc=nc, target_vars_in=target_vars_in, window=window, flag=flag)
 
 
-def flatline_test(nc, target_vars_in=[], window=3, flag=4):
+def flatline_test(nc, target_vars_in=[], window=5, flag=4):
     
-        # If target_vars aren't user specified, set it to all the variables of 
-        # the current_file, removing unwanted variables
-        if target_vars_in == []:
-            
-            target_vars = list(nc.variables.keys())
-            
-            # Remove TIME
-            target_vars.remove('TIME')
-            
-            # Remove any quality_control variables
-            qc_vars = [s for s in target_vars if 'quality_control' in s]
-            target_vars = [s for s in target_vars if s not in qc_vars]
-                            
-            # Remove any variables of single length
-            single_vars = [s for s in target_vars if nc.variables[s].size==1]
-            target_vars = [s for s in target_vars if s not in single_vars]
-            
-            print('target_vars are '+' '.join(target_vars))
-            
-        else:
-            target_vars = target_vars_in
-            
-        # For each variable, extract the data 
-        for current_var in target_vars:
-            
-            # Extract the variable
-            nc_var = nc.variables[current_var]
-            
-            if nc_var.name + "_quality_control_flt" in nc.variables:
-                ncVarOut = nc.variables[nc_var.name + "_quality_control_flt"]
-            else:
-                ncVarOut = nc.createVariable(nc_var.name + "_quality_control_flt", "i1", nc_var.dimensions, fill_value=99, zlib=True)  # fill_value=0 otherwise defaults to max
-                ncVarOut[:] = np.zeros(nc_var.shape)
-                ncVarOut.long_name = "quality flag for " + nc_var.name
-                ncVarOut.flag_values = np.array([0, 1, 2, 3, 4, 6, 7, 9], dtype=np.int8)
-                ncVarOut.flag_meanings = 'unknown good_data probably_good_data probably_bad_data bad_data not_deployed interpolated missing_value'
+    print('Window is '+str(window))
+    
+    # If target_vars aren't user specified, set it to all the variables of 
+    # the current_file, removing unwanted variables
+    if target_vars_in == []:
+        
+        target_vars = list(nc.variables.keys())
+        
+        # Remove TIME
+        target_vars.remove('TIME')
+        
+        # Remove any quality_control variables
+        qc_vars = [s for s in target_vars if 'quality_control' in s]
+        target_vars = [s for s in target_vars if s not in qc_vars]
+                        
+        # Remove any variables of single length
+        single_vars = [s for s in target_vars if nc.variables[s].size==1]
+        target_vars = [s for s in target_vars if s not in single_vars]
+        
+        print('target_vars are '+' '.join(target_vars))
         
+    else:
+        target_vars = target_vars_in
+        
+    # For each variable, extract the data 
+    for current_var in target_vars:
+        
+        # Extract the variable
+        nc_var = nc.variables[current_var]
+        
+        if nc_var.name + "_quality_control_flt" in nc.variables:
+            print('flt qc variable already present')
+            ncVarOut = nc.variables[nc_var.name + "_quality_control_flt"]
+            ncVarOut[:] = 0
+        else:
+            ncVarOut = nc.createVariable(nc_var.name + "_quality_control_flt", "i1", nc_var.dimensions, fill_value=99, zlib=True)  # fill_value=0 otherwise defaults to max
+            ncVarOut[:] = 0
+            # print(all(nc.variables[nc_var.name + "_quality_control_flt"]==0))
+            ncVarOut.long_name = "quality flag for " + nc_var.name
+            ncVarOut.flag_values = np.array([0, 1, 2, 3, 4, 6, 7, 9], dtype=np.int8)
+            ncVarOut.flag_meanings = 'unknown good_data probably_good_data probably_bad_data bad_data not_deployed interpolated missing_value'
+
             # add new variable to list of aux variables
             nc_var.ancillary_variables = nc_var.ancillary_variables + " " + nc_var.name + "_quality_control_flt"
-            
-            var_data = np.array(nc.variables[current_var])
-            
-            print('checking '+current_var)
-            
-            # Step through the data, one element at a time, using the window
-            for i in range(0,(len(var_data)-window+1)):
-                
-                # This is true if 'window' elements in a row are equal
-                if len(set(var_data[i:(i+window)])) == 1:
-                    
-                    # set corresponding QC value to...
-                    nc.variables[current_var+'_quality_control_flt'][i:(i+window)] = flag
-                    
-            nc.variables[current_var  + "_quality_control"][:] = np.maximum(nc.variables[current_var  + "_quality_control_flt"][:],nc.variables[current_var  + "_quality_control"][:])
-
-        # update the history attribute
-        try:
-            hist = nc.history + "\n"
-        except AttributeError:
-            hist = ""
-            
-            
-
-        nc.setncattr('history', hist + datetime.utcnow().strftime("%Y-%m-%d") + 'flatline_test performed on [' + str(target_vars) + '], window '+str(window)+' consecutive values or more were flagged with '+str(flag) )
-
-        nc.close()
+
+        var_data = np.array(nc.variables[current_var][:])
+
+        if (all(nc.variables[nc_var.name + "_quality_control_flt"][:] == 0)):
+            print('All test specific qc values are zero before filling')
+
+        print('checking ' + current_var)
+
+        print('Window is ' + str(window))
+
+        # Step through the data, one element at a time, using the window
+        for i in range(0, (len(var_data) - window + 1)):
+
+            # This is true if 'window' elements in a row are equal
+            if len(set(var_data[i:(i + window)])) == 1:
+                print(str(i))
+                # set corresponding QC value to...
+                ncVarOut[i:(i + window)] = flag
+
+        points_marked = len([elem for elem in ncVarOut[:] if elem == 4])
+        print('Data points flagged: ', points_marked)
+
+        qc_var = nc.variables[current_var + "_quality_control"]
+        qc_var[:] = np.maximum(ncVarOut[:], qc_var[:])
+    # update the history attribute
+    try:
+        hist = nc.history + "\n"
+    except AttributeError:
+        hist = ""
+        
+        
+
+    nc.setncattr('history', hist + datetime.utcnow().strftime("%Y-%m-%d") + 'flatline_test performed on [' + str(target_vars) + '], window '+str(window)+' consecutive values or more were flagged with '+str(flag) )
+
+    nc.close()
     
 if __name__ == "__main__":
     # usage is <file_name> <variable_name> <window> <flag value>
diff --git a/ocean_dp/qc/global_range.py b/ocean_dp/qc/global_range.py
index 68e2ee0..db3b80b 100644
--- a/ocean_dp/qc/global_range.py
+++ b/ocean_dp/qc/global_range.py
@@ -57,9 +57,10 @@ def global_range(netCDFfile, variable, max, min, qc_value=4):
     # create a qc variable just for this test flags
     if nc_var.name + "_quality_control_gr" in ds.variables:
         ncVarOut = ds.variables[nc_var.name + "_quality_control_gr"]
+        ncVarOut[:] = 0
     else:
         ncVarOut = ds.createVariable(nc_var.name + "_quality_control_gr", "i1", nc_var.dimensions, fill_value=99, zlib=True)  # fill_value=0 otherwise defaults to max
-        ncVarOut[:] = np.zeros(nc_var.shape)
+        ncVarOut[:] = 0
         ncVarOut.long_name = "quality flag for " + nc_var.name
         ncVarOut.flag_values = np.array([0, 1, 2, 3, 4, 6, 7, 9], dtype=np.int8)
         ncVarOut.flag_meanings = 'unknown good_data probably_good_data probably_bad_data bad_data not_deployed interpolated missing_value'
diff --git a/ocean_dp/qc/qc_checker.py b/ocean_dp/qc/qc_checker.py
index e229d72..90e83e8 100755
--- a/ocean_dp/qc/qc_checker.py
+++ b/ocean_dp/qc/qc_checker.py
@@ -115,7 +115,11 @@ def qc_checker(nc,target_vars_in=[]):
     nc.close()
     
     
-    
+# def qc_check_plot(target_file,target_var)
+
+#     nc = Dataset(target_file,'r')
+
+        
     
     
     
diff --git a/ocean_dp/qc/rate_of_change_test.py b/ocean_dp/qc/rate_of_change_test.py
index a7f54d1..84f8e4f 100755
--- a/ocean_dp/qc/rate_of_change_test.py
+++ b/ocean_dp/qc/rate_of_change_test.py
@@ -102,26 +102,38 @@ def roc_test(nc,*args,target_vars_in=[]):
                 
                 if nc_var.name + "_quality_control_roc" in nc.variables:
                     ncVarOut = nc.variables[nc_var.name + "_quality_control_roc"]
+                    ncVarOut[:] = np.zeros(nc.variables[nc_var.name + "_quality_control_roc"].shape)
                 else:
                     ncVarOut = nc.createVariable(nc_var.name + "_quality_control_roc", "i1", nc_var.dimensions, fill_value=99, zlib=True)  # fill_value=0 otherwise defaults to max
-                    ncVarOut[:] = np.zeros(nc_var.shape)
+                    ncVarOut[:] = 0
                     ncVarOut.long_name = "quality flag for " + nc_var.name
                     ncVarOut.flag_values = np.array([0, 1, 2, 3, 4, 6, 7, 9], dtype=np.int8)
                     ncVarOut.flag_meanings = 'unknown good_data probably_good_data probably_bad_data bad_data not_deployed interpolated missing_value'
             
-                # add new variable to list of aux variables
-                nc_var.ancillary_variables = nc_var.ancillary_variables + " " + nc_var.name + "_quality_control_roc"
+                    # add new variable to list of aux variables
+                    nc_var.ancillary_variables = nc_var.ancillary_variables + " " + nc_var.name + "_quality_control_roc"
                 
                 # Extract the variable data
                 var_data = np.array(nc.variables[current_var][:])
                 
+                print('Not equal to zero test type 1')
+                print(str(np.where(ncVarOut[:]!=0)))
+                
+                if (all(ncVarOut[:] == 0)):
+                    print('All test specific qc values are zero before filling')
+                
                 # Calculate dvar/dtime
                 var_roc = np.divide(np.diff(var_data),np.diff(nc_time_hr))
                 
                 # For any change greater than change_per_hr, assign a qc value of 4
-                nc.variables[current_var+'_quality_control_roc'][[x for x in abs(np.insert(var_roc,0,0)) > change_per_hr]] = 4
+                ncVarOut[[x for x in abs(np.insert(var_roc,0,0)) > change_per_hr]] = 4
+
+                # Extract global qc
+                qc_var = nc.variables[current_var + "_quality_control"]
+                
+                # Overwrite global qc with any higher values from test specific qc
+                qc_var[:] = np.maximum(ncVarOut[:], qc_var[:])
                 
-                nc.variables[current_var  + "_quality_control"][:] = np.maximum(nc.variables[current_var  + "_quality_control_roc"][:],nc.variables[current_var  + "_quality_control"][:])
                 
                 print(current_var + ' tested: '+str(sum([x for x in abs(np.insert(var_roc,0,0)) > change_per_hr])) + ' changes found above '+str(change_per_hr)+' '+nc.variables[current_var].units+' per hour')
                     
@@ -159,27 +171,50 @@ def roc_test(nc,*args,target_vars_in=[]):
                 
                 if nc_var.name + "_quality_control_roc" in nc.variables:
                     ncVarOut = nc.variables[nc_var.name + "_quality_control_roc"]
+                    ncVarOut[:] = np.zeros(nc.variables[nc_var.name + "_quality_control_roc"].shape)
                 else:
                     ncVarOut = nc.createVariable(nc_var.name + "_quality_control_roc", "i1", nc_var.dimensions, fill_value=99, zlib=True)  # fill_value=0 otherwise defaults to max
-                    ncVarOut[:] = np.zeros(nc_var.shape)
+                    ncVarOut[:] = 0
                     ncVarOut.long_name = "quality flag for " + nc_var.name
                     ncVarOut.flag_values = np.array([0, 1, 2, 3, 4, 6, 7, 9], dtype=np.int8)
                     ncVarOut.flag_meanings = 'unknown good_data probably_good_data probably_bad_data bad_data not_deployed interpolated missing_value'
             
-                # add new variable to list of aux variables
-                nc_var.ancillary_variables = nc_var.ancillary_variables + " " + nc_var.name + "_quality_control_roc"
-                                
+                    # add new variable to list of aux variables
+                    nc_var.ancillary_variables = nc_var.ancillary_variables + " " + nc_var.name + "_quality_control_roc"
+                       
                 
                 # Extract the data
                 var_data = np.array(nc.variables[current_var])
                 
+                print('Not equal to zero test type 2')
+                print(str(np.where(ncVarOut[:]!=0)))
+                
+                if (all(ncVarOut[:] == 0)):
+                    print('All test specific qc values are zero before filling')
+                
                 # Calculate dvar/dtime
                 var_roc = np.divide(np.diff(var_data),np.diff(nc_time_hr))
                 
                 # For any change greater than change_per_hr, assign a qc value of 4
-                nc.variables[current_var+'_quality_control_roc'][[x for x in abs(np.insert(var_roc,0,0)) > rate_spec[current_var]]] = 4
+                ncVarOut[[x for x in abs(np.insert(var_roc,0,0)) > rate_spec[current_var]]] = 4
+                
+                print('ncVarOut 4s')
+                print(str(np.where(ncVarOut[:]==4)))
+                
+                print('Netcdf variable 4s')
+                print(str(np.where(nc.variables[nc_var.name + "_quality_control_roc"][:]==4)))
+                
+                nc.variables[nc_var.name + "_quality_control_roc"][:] = ncVarOut[:]
+                
+                print('Netcdf variable 4s after assignment')
+                print(str(np.where(nc.variables[nc_var.name + "_quality_control_roc"][:]==4)))
+                
+                
+                # Extract global qc
+                qc_var = nc.variables[current_var + "_quality_control"]
                 
-                nc.variables[current_var  + "_quality_control"][:] = np.maximum(nc.variables[current_var  + "_quality_control_roc"][:],nc.variables[current_var  + "_quality_control"][:])
+                # Overwrite global qc with any higher values from test specific qc
+                qc_var[:] = np.maximum(ncVarOut[:], qc_var[:])
                 
                 print(current_var + ' tested: '+str(sum([x for x in abs(np.insert(var_roc,0,0)) > rate_spec[current_var]])) + ' changes found above '+str(rate_spec[current_var])+' '+nc.variables[current_var].units+' per hour')
                     
diff --git a/ocean_dp/qc/spike_test.py b/ocean_dp/qc/spike_test.py
index 622d3a3..089cf63 100755
--- a/ocean_dp/qc/spike_test.py
+++ b/ocean_dp/qc/spike_test.py
@@ -115,15 +115,15 @@ def spike_test(nc, target_vars_in=[], thresh_low=default_low, thresh_high=defaul
                 print('High spike found')
                 
                 #set corresponding QC value to...
-                nc.variables[current_var+'_quality_control_spk'][i] = flag_high
+                ncVarOut[i] = flag_high
                 
         
         # Find the indices where qc isn't set to 4 (high spike), removing the final element as it can't be check for a spike
-        low_spike_chk_idx = np.where(nc.variables[current_var+'_quality_control'][:]!=4)[0][0:-1]
+        low_spike_chk_idx = np.where(ncVarOut[:]!=4)[0][0:-1]
 
         
         # Remove from the indices those that are either side of a high spike
-        for i in np.where(nc.variables[current_var+'_quality_control'][:]==4)[0]:
+        for i in ncVarOut[:]==4:
             
             low_spike_chk_idx=low_spike_chk_idx[low_spike_chk_idx!=[i-1]]
             
@@ -150,7 +150,7 @@ def spike_test(nc, target_vars_in=[], thresh_low=default_low, thresh_high=defaul
                 print('Low spike found')
                 
                 #set corresponding QC value to...
-                nc.variables[current_var+'_quality_control_spk'][i] = flag_low   
+                ncVarOut[i] = flag_low   
                 
     nc.variables[current_var  + "_quality_control"][:] = np.maximum(nc.variables[current_var  + "_quality_control_spk"][:],nc.variables[current_var  + "_quality_control"][:])
 
diff --git a/ocean_dp/sots_processing_runthrough.py b/ocean_dp/sots_processing_runthrough.py
index 6c8bd90..75fe6f4 100755
--- a/ocean_dp/sots_processing_runthrough.py
+++ b/ocean_dp/sots_processing_runthrough.py
@@ -40,7 +40,7 @@
 start = time.time()
 
 # Set the working directory
-os.chdir('/Users/tru050/Desktop/sofs7.5 test data')
+os.chdir('/Users/tru050/Desktop/sofs6 test data')
 
 # Make a list of FV00 filenames
 fv00_files = glob.glob('*IMOS_ABOS-SOTS*FV00*.nc')
@@ -70,13 +70,13 @@
     global_range.global_range(ncfile,'TEMP',40,-2)
 
 # Rate of change
-rate_of_change_test.roc_test_files(fv01_pres_interp_files,'TEMP',20)
+rate_of_change_test.roc_test_files(fv01_pres_interp_files,'TEMP',10)
 
 # Spike
 spike_test.spike_test_files(fv01_pres_interp_files,target_vars_in=['TEMP'])
 
 # Flatline
-flatline_test.flatline_test_files(fv01_pres_interp_files,['TEMP'])
+flatline_test.flatline_test_files(fv01_pres_interp_files,['TEMP'],window=10)
 
 # Check qc process has worked
 fv01_qc_checked = qc_checker.qc_checker_files(fv01_pres_interp_files,['TEMP'])

From 43dd011c0a4b308aecc79c344b47a96c75414153 Mon Sep 17 00:00:00 2001
From: bweeding <57697604+bweeding@users.noreply.github.com>
Date: Tue, 19 May 2020 17:17:28 +1000
Subject: [PATCH 39/59] updating Pete's plotQC, building temp_stat_plot

---
 ocean_dp/plotting/plotQC.py            |  88 ++++++++++++++++++++
 ocean_dp/qc/temp_diff_hist_extra.py    |  16 ++++
 ocean_dp/qc/temp_diff_histograms.py    |   4 +-
 ocean_dp/qc/temp_stat_plot.py          | 107 +++++++++++++++++++++++++
 ocean_dp/sots_processing_runthrough.py |   2 +-
 5 files changed, 214 insertions(+), 3 deletions(-)
 create mode 100755 ocean_dp/plotting/plotQC.py
 create mode 100755 ocean_dp/qc/temp_stat_plot.py

diff --git a/ocean_dp/plotting/plotQC.py b/ocean_dp/plotting/plotQC.py
new file mode 100755
index 0000000..3060759
--- /dev/null
+++ b/ocean_dp/plotting/plotQC.py
@@ -0,0 +1,88 @@
+#!/usr/bin/python3
+
+# raw2netCDF
+# Copyright (C) 2019 Peter Jansen
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+import xarray as xr
+
+import matplotlib.pyplot as plt
+from matplotlib.backends.backend_pdf import PdfPages
+import seaborn as sns
+
+from pandas.plotting import register_matplotlib_converters
+
+import sys
+import os
+
+
+def plot_all(files):
+    register_matplotlib_converters()
+    plt.style.use('seaborn-darkgrid')
+    sns.set_context("paper")
+
+    pp = PdfPages(os.path.join(os.path.dirname(files[0]), "batch-qc.pdf"))
+
+    for f in files:
+        print("file ", f)
+        do_plot(f)
+
+        pp.savefig()
+        plt.close()
+
+    pp.close()
+
+
+def plot(fn):
+    register_matplotlib_converters()
+    plt.style.use('seaborn-darkgrid')
+    sns.set_context("paper")
+
+    pp = PdfPages(fn + "-qc.pdf")
+
+    do_plot(fn)
+
+    pp.savefig()
+    pp.close()
+    plt.close()
+
+
+def do_plot(fn):
+
+    #fn = sys.argv[1]
+    #fn = '/Users/pete/cloudstor/SOTS-Temp-Raw-Data/SOFS-7.5-2018/netCDF/IMOS_ABOS-SOTS_TIP_20180801_SOFS_FV01_SOFS-7.5-2018-Starmon-mini-4047-40m_END-20190331_C-20200429.nc'
+
+    DS = xr.open_dataset(fn)
+
+    ax1 = plt.subplot(2, 1, 1)
+    plt.plot(DS.TIME, DS.PAR)
+    plt.title(DS.deployment_code + " - " + DS.instrument_model + ":" + DS.instrument_serial_number + " @ " + str(DS.instrument_nominal_depth), {'fontsize': 8})
+
+    ax2 = plt.subplot(2, 1, 2, sharex=ax1)
+    aux = DS.PAR.ancillary_variables
+    a_vars = aux.split(" ")
+    for f in sorted(set(a_vars)):
+        print('aux var', f)
+        varn = f.split("_")
+        plt.plot(DS.TIME, DS.variables[f], label=varn[-1])
+    plt.ylim(0, 9)
+
+    plt.legend(prop={'size': 6})
+
+    DS.close()
+
+
+if __name__ == "__main__":
+    plot_all(sys.argv[1:])
\ No newline at end of file
diff --git a/ocean_dp/qc/temp_diff_hist_extra.py b/ocean_dp/qc/temp_diff_hist_extra.py
index d8b6019..14dc6cf 100755
--- a/ocean_dp/qc/temp_diff_hist_extra.py
+++ b/ocean_dp/qc/temp_diff_hist_extra.py
@@ -13,6 +13,22 @@
 # You should have received a copy of the GNU General Public License
 # along with this program. If not, see <http://www.gnu.org/licenses/>.
 
+import numpy.ma as ma
+import sys
+from netCDF4 import Dataset
+import numpy as np
+import argparse
+import glob
+import pytz
+import os
+import matplotlib.pyplot as plt
+from matplotlib import colors
+from matplotlib.ticker import PercentFormatter
+import glob
+from netCDF4 import num2date
+from dateutil import parser
+import datetime
+
 for root, dirs, files in os.walk("/Users/tru050/Desktop/cloudstor/Shared/SOTS-Temp-Raw-Data"):
     
     for fname in files:
diff --git a/ocean_dp/qc/temp_diff_histograms.py b/ocean_dp/qc/temp_diff_histograms.py
index 2003afc..a3f8420 100755
--- a/ocean_dp/qc/temp_diff_histograms.py
+++ b/ocean_dp/qc/temp_diff_histograms.py
@@ -130,7 +130,7 @@ def last_four(entry):
 deployments.sort(key=last_four)
 
 
-
+#######
 
 
 
@@ -207,7 +207,7 @@ def last_four(entry):
                 
 for plt_idx,dep_name in zip(range(0,len(deployments)),deployments):            
 
-    print('plotting '+ str(len(all_deployment_dtemp_dtime[plt_idx])) + ' values')
+    print('plotting '+ str((all_deployment_dtemp_dtime[plt_idx])) + ' values')
             
     hist_data = ax[plt_idx].hist(all_deployment_dtemp_dtime[plt_idx],21,log=True)
     
diff --git a/ocean_dp/qc/temp_stat_plot.py b/ocean_dp/qc/temp_stat_plot.py
new file mode 100755
index 0000000..752ebe5
--- /dev/null
+++ b/ocean_dp/qc/temp_stat_plot.py
@@ -0,0 +1,107 @@
+# Copyright (C) 2020 Ben Weeding
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+import numpy.ma as ma
+import sys
+from netCDF4 import Dataset, num2date
+from dateutil import parser
+import numpy as np
+import argparse
+import glob
+import pytz
+import os
+import matplotlib.pyplot as plt
+from matplotlib import colors
+from matplotlib.ticker import PercentFormatter
+from sigfig import round
+import pandas as pd
+
+# creates an empty array to store the names of the SOTS deployments
+deployments = []
+
+# loops through all the folders and files contained in the folder
+for x in os.listdir("/Users/tru050/Desktop/cloudstor/Shared/SOTS-Temp-Raw-Data"):
+    
+    # if the folder/file name contains 'Pulse' or 'SOFS' and doesn't contain '.', append it to deployments
+    if (('Pulse' in x) or ('SOFS' in x)) and ('.' not in x):
+        
+        deployments.append(x)
+    
+# create a dataframe to store extract information
+temp_ensemble = pd.DataFrame(columns = ["Temp rate of change","QC","Nominal depth","Deployment"])
+
+# loops through all files in the directory
+for root, dirs, files in os.walk("/Users/tru050/Desktop/cloudstor/Shared/SOTS-Temp-Raw-Data"):
+    
+    for fname in files:
+        
+        # for each netcdf file labelled as FV01
+        if fname.endswith('.nc') and 'FV01' in fname:
+        
+            # print the filename
+            print(fname)  
+            
+            # open the file
+            nc = Dataset(os.path.join(root,fname), mode = 'r')
+            
+            # check that the in_out_water test has been run on the file
+            if 'TEMP_quality_control_io' in list(nc.variables):
+                
+                # check that the file has a single dimension temperature vector, and that the time format is correct
+                if np.array(nc.variables['TEMP'][:]).ndim == 1 and nc.variables['TIME'].getncattr('units') =='days since 1950-01-01 00:00:00 UTC':
+                    
+                    # calculate temperature changes for in water data
+                    nc_temp_diffs = np.diff(np.array(nc.variables['TEMP'][np.array(nc.variables['TEMP_quality_control'][:])!=7]))
+                
+                    # extract the time data
+                    nc_time = np.array(nc.variables['TIME'][np.array(nc.variables['TEMP_quality_control'][:])!=7])
+            
+                    # Convert from days to hours
+                    nc_time_hr = nc_time*24
+                    
+                    # Calculate time changes in hours
+                    nc_time_hr_diffs = np.diff(nc_time_hr)
+                    
+                    # calculate the rate of change of temperature wrt time (degrees °C per hour)
+                    nc_dtemp_dtime = np.divide(nc_temp_diffs,nc_time_hr_diffs)
+                    
+                    
+                    
+                    # extract temp_qc data
+                    nc_temp_qc = np.array(nc.variables['TEMP_quality_control'][np.array(nc.variables['TEMP_quality_control'][:])!=7])
+                    
+                    # calculate qc values for each nc_dtemp_dtime by taking the maximum of the qc values of the two contributing temps
+                    nc_dtemp_dtime_qc = pd.Series(nc_temp_qc).rolling(2).max().dropna().to_numpy()
+                    
+                    
+                    # extract sensor nominal depth
+                    nc_nom_depth = np.array(nc.variables['NOMINAL_DEPTH'])
+                    
+                    
+                    # extract deployment name
+                    nc_deployment = nc.deployment_code
+                    
+                    # Next step: append all this information to temp ensemble!
+                    
+            nc.close()
+                    
+                    
+            
+            # pd.Series(lst).rolling(5).max().dropna().to_numpy()
+            
+            
+            
+            
+            
\ No newline at end of file
diff --git a/ocean_dp/sots_processing_runthrough.py b/ocean_dp/sots_processing_runthrough.py
index 75fe6f4..c939838 100755
--- a/ocean_dp/sots_processing_runthrough.py
+++ b/ocean_dp/sots_processing_runthrough.py
@@ -40,7 +40,7 @@
 start = time.time()
 
 # Set the working directory
-os.chdir('/Users/tru050/Desktop/sofs6 test data')
+#os.chdir('/Users/tru050/Desktop/sofs6 test data')
 
 # Make a list of FV00 filenames
 fv00_files = glob.glob('*IMOS_ABOS-SOTS*FV00*.nc')

From 4a9590f5c8dfd95ccd2ad9d67baccdea4a097556 Mon Sep 17 00:00:00 2001
From: bweeding <57697604+bweeding@users.noreply.github.com>
Date: Wed, 20 May 2020 10:15:26 +1000
Subject: [PATCH 40/59] Update temp_stat_plot.py

Code loads and processes SOTS data for std calc
---
 ocean_dp/qc/temp_stat_plot.py | 189 ++++++++++++++++++++++++++++++++--
 1 file changed, 178 insertions(+), 11 deletions(-)

diff --git a/ocean_dp/qc/temp_stat_plot.py b/ocean_dp/qc/temp_stat_plot.py
index 752ebe5..ab93f2c 100755
--- a/ocean_dp/qc/temp_stat_plot.py
+++ b/ocean_dp/qc/temp_stat_plot.py
@@ -28,36 +28,100 @@
 from sigfig import round
 import pandas as pd
 
+############################# Data extraction ################################
+
 # creates an empty array to store the names of the SOTS deployments
 deployments = []
 
+checked_files = []
+
+processed_files = []
+
 # loops through all the folders and files contained in the folder
 for x in os.listdir("/Users/tru050/Desktop/cloudstor/Shared/SOTS-Temp-Raw-Data"):
     
     # if the folder/file name contains 'Pulse' or 'SOFS' and doesn't contain '.', append it to deployments
-    if (('Pulse' in x) or ('SOFS' in x)) and ('.' not in x):
+    if (('Pulse' in x) or ('SOFS' in x)) and ('.p' not in x):
         
         deployments.append(x)
     
 # create a dataframe to store extract information
-temp_ensemble = pd.DataFrame(columns = ["Temp rate of change","QC","Nominal depth","Deployment"])
+sots_temp_ensemble = pd.DataFrame(columns = ["Temp rate of change","QC","Nominal depth","Deployment"])
 
 # loops through all files in the directory
 for root, dirs, files in os.walk("/Users/tru050/Desktop/cloudstor/Shared/SOTS-Temp-Raw-Data"):
     
     for fname in files:
         
-        # for each netcdf file labelled as FV01
-        if fname.endswith('.nc') and 'FV01' in fname:
+        # append the filename to the list of checked files
+        checked_files.append(fname)
+        
+        # for each netcdf file labelled as FV01 and containing a deployment in its name
+        if fname.endswith('.nc') and 'FV01' in fname and any(ele in fname for ele in deployments):
         
             # print the filename
             print(fname)  
             
             # open the file
-            nc = Dataset(os.path.join(root,fname), mode = 'r')
+            nc = Dataset(os.path.join(root,fname), mode = 'a')
+            
+            # check file contains temperature data
+            if 'TEMP' in list(nc.variables):
             
-            # check that the in_out_water test has been run on the file
-            if 'TEMP_quality_control_io' in list(nc.variables):
+                # check that the in_out_water test has been run on the file, if not run in_out_water code
+                if not 'TEMP_quality_control_io' in list(nc.variables):
+                
+                    # run in_out_water script - uncommented at this point as just copied and pasted
+                    var_name = 'TEMP'
+                    nc_vars = nc.variables
+                    to_add = []
+                    if var_name:
+                        to_add.append(var_name)
+                    else:
+                        for v in nc_vars:
+                            #print (vars[v].dimensions)
+                            if v != 'TIME':
+                                to_add.append(v)
+                
+                    time_var = nc_vars["TIME"]
+                    time = num2date(time_var[:], units=time_var.units, calendar=time_var.calendar)
+                
+                    time_deploy = parser.parse(nc.time_deployment_start, ignoretz=True)
+                    time_recovery = parser.parse(nc.time_deployment_end, ignoretz=True)
+                
+                    print('deployment time', time_deploy)
+                
+                    print(to_add)
+                
+                    # create a mask for the time range
+                    mask = (time <= time_deploy) | (time >= time_recovery)
+                
+                    for v in to_add:
+                        if "TIME" in nc_vars[v].dimensions:
+                            if v.endswith("_quality_control"):
+                                print("QC time dim ", v)
+                
+                                ncVarOut = nc_vars[v]
+                                ncVarOut[mask] = 7
+                            else:
+                                # create a qc variable just for this test flags
+                                if v + "_quality_control_io" in nc.variables:
+                                    ncVarOut = nc.variables[v + "_quality_control_io"]
+                                else:
+                                    ncVarOut = nc.createVariable(v + "_quality_control_io", "i1", nc_vars[v].dimensions, fill_value=99, zlib=True)  # fill_value=0 otherwise defaults to max
+                                ncVarOut[:] = np.zeros(nc_vars[v].shape)
+                                ncVarOut.long_name = "quality flag for " + v
+                                ncVarOut.flag_values = np.array([0, 1, 2, 3, 4, 6, 7, 9], dtype=np.int8)
+                                ncVarOut.flag_meanings = 'unknown good_data probably_good_data probably_bad_data bad_data not_deployed interpolated missing_value'
+                
+                                nc_vars[v].ancillary_variables = nc_vars[v].ancillary_variables + " " + v + "_quality_control_io"
+                                ncVarOut[mask] = 7
+                        
+                        nc.variables[v + "_quality_control"][:] = np.maximum(nc.variables[v + "_quality_control_io"][:],nc.variables[v + "_quality_control"][:])
+                
+                    nc.file_version = "Level 1 - Quality Controlled Data"
+                
+                
                 
                 # check that the file has a single dimension temperature vector, and that the time format is correct
                 if np.array(nc.variables['TEMP'][:]).ndim == 1 and nc.variables['TIME'].getncattr('units') =='days since 1950-01-01 00:00:00 UTC':
@@ -86,22 +150,125 @@
                     nc_dtemp_dtime_qc = pd.Series(nc_temp_qc).rolling(2).max().dropna().to_numpy()
                     
                     
+                    
                     # extract sensor nominal depth
                     nc_nom_depth = np.array(nc.variables['NOMINAL_DEPTH'])
                     
+                    # create a vector the same size as nc_dtemp_dtime with the nominal depth
+                    nc_nom_depth_vector = np.repeat(nc_nom_depth,len(nc_dtemp_dtime))
+                    
+                    
                     
                     # extract deployment name
                     nc_deployment = nc.deployment_code
                     
-                    # Next step: append all this information to temp ensemble!
+                    # create a list the same size as nc_dtemp_dtime with the deployment name
+                    nc_deployment_list = [nc_deployment] * len(nc_dtemp_dtime)
+                    
+                    
+                    
+                    # combine information into an length x 4 dataframe
+                    nc_temp_ensemble = pd.DataFrame({"Temp rate of change":nc_dtemp_dtime,"QC":nc_dtemp_dtime_qc,"Nominal depth":nc_nom_depth_vector,"Deployment":nc_deployment_list})
+                    
+                    # append the current netcdf's dataframe to the sots_temp_ensemble
+                    sots_temp_ensemble = sots_temp_ensemble.append(nc_temp_ensemble)
+                    
+                    # append the filename to the list of processed files
+                    processed_files.append(fname)
+                    
                     
             nc.close()
                     
                     
+############################# Data processing ################################
             
-            # pd.Series(lst).rolling(5).max().dropna().to_numpy()
-            
-            
+# creates a new dataframe containing only data with QC < 3
+sots_temp_ensemble_qc210 = sots_temp_ensemble[sots_temp_ensemble["QC"]<3]
+
+# calculates overall standard deviation
+std_total = np.std(sots_temp_ensemble_qc210["Temp rate of change"])
+
+
+
+
+# creates an empty list to store data deployment by deployment
+std_by_deployment_data = []
+
+# creates a dict of deployment names and standard deviations
+for i in sots_temp_ensemble_qc210.Deployment.unique():
+    std_by_deployment_data.append(
+        {
+            'Deployment': i,
+            'STD': np.std(sots_temp_ensemble_qc210["Temp rate of change"][sots_temp_ensemble_qc210["Deployment"]==i]),
+        }
+    )
+
+# creates a Dataframe from the dict
+std_by_deployment = pd.DataFrame(std_by_deployment_data)
+
+
+
+
+
+# =============================================================================
+# std_by_depth: this function takes two compulsory arguments (top: the shallowest
+# depth(m), bottom: the deepest depth(m)) and one optional argument (deployment_in:
+# the deployment from which data will be taken). The function will return the standard
+# deviation of the d(Temp)/d(Time) data from sensors with nominal depths at and 
+# between the two depths, and from only the deployment_in if specified.
+#
+# sample call: std_by_depth(500,10000,'SOFS-7.5-2018')
+#
+# this will give the std of all d(Temp)/d(Time) data from SOFS-7.5-2018 from 
+# sensors with 500m <= nominal depth <= 10000m
+# =============================================================================
+
+def std_by_depth(top,bottom,deployment_in=None):
+    
+    if deployment_in == None:
+    
+        # subsamples sots_temp_ensemble_qc210 based on depth
+        target_ensemble = sots_temp_ensemble_qc210[(sots_temp_ensemble_qc210["Nominal depth"]>=top) & (sots_temp_ensemble_qc210["Nominal depth"]<=bottom)]
+        
+        # calculates the standard deviation of the subsample
+        target_std = np.std(target_ensemble["Temp rate of change"])
+        
+        # returns the standard deviation of the subsample
+        return target_std
+    
+    else:   
+    
+        # subsamples sots_temp_ensemble_qc210 based on depth
+        target_ensemble = sots_temp_ensemble_qc210[(sots_temp_ensemble_qc210["Nominal depth"]>=top) & (sots_temp_ensemble_qc210["Nominal depth"]<=bottom) & (sots_temp_ensemble_qc210["Deployment"]==deployment_in)]
+        
+        # calculates the standard deviation of the subsample
+        target_std = np.std(target_ensemble["Temp rate of change"])
+        
+        # returns the standard deviation of the subsample
+        return target_std    
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
             
             
             
\ No newline at end of file

From f1d5618bc204a64bb15c19a4d525832eee08bd58 Mon Sep 17 00:00:00 2001
From: bweeding <57697604+bweeding@users.noreply.github.com>
Date: Wed, 20 May 2020 11:53:13 +1000
Subject: [PATCH 41/59] Update temp_stat_plot.py

adding plotting to function and improving if/else layout
---
 ocean_dp/qc/temp_stat_plot.py | 30 ++++++++++++++++++++----------
 1 file changed, 20 insertions(+), 10 deletions(-)

diff --git a/ocean_dp/qc/temp_stat_plot.py b/ocean_dp/qc/temp_stat_plot.py
index ab93f2c..bc73767 100755
--- a/ocean_dp/qc/temp_stat_plot.py
+++ b/ocean_dp/qc/temp_stat_plot.py
@@ -230,22 +230,32 @@ def std_by_depth(top,bottom,deployment_in=None):
         # subsamples sots_temp_ensemble_qc210 based on depth
         target_ensemble = sots_temp_ensemble_qc210[(sots_temp_ensemble_qc210["Nominal depth"]>=top) & (sots_temp_ensemble_qc210["Nominal depth"]<=bottom)]
         
-        # calculates the standard deviation of the subsample
-        target_std = np.std(target_ensemble["Temp rate of change"])
-        
-        # returns the standard deviation of the subsample
-        return target_std
-    
     else:   
     
         # subsamples sots_temp_ensemble_qc210 based on depth
         target_ensemble = sots_temp_ensemble_qc210[(sots_temp_ensemble_qc210["Nominal depth"]>=top) & (sots_temp_ensemble_qc210["Nominal depth"]<=bottom) & (sots_temp_ensemble_qc210["Deployment"]==deployment_in)]
         
-        # calculates the standard deviation of the subsample
-        target_std = np.std(target_ensemble["Temp rate of change"])
+    # calculates the mean of the subsample
+    target_mean = np.mean(target_ensemble["Temp rate of change"])
+            
+    # calculates the standard deviation of the subsample
+    target_std = np.std(target_ensemble["Temp rate of change"])
         
-        # returns the standard deviation of the subsample
-        return target_std    
+    line_thick = 1
+    
+    ax_hist=plt.axes()
+    
+    target_ensemble.hist(column="Temp rate of change",bins=100,log=True,ax=ax_hist)
+    
+    ax_hist.axvline(x=target_mean+3*target_std,color='r',linewidth=line_thick) 
+
+    ax_hist.axvline(x=target_mean-3*target_std,color='r',linewidth=line_thick) 
+    
+    
+    # returns the standard deviation of the subsample
+    return target_std
+    
+
 
 
 

From 528c5cf2de4062d76e59edf0524ee3405848a144 Mon Sep 17 00:00:00 2001
From: bweeding <57697604+bweeding@users.noreply.github.com>
Date: Wed, 20 May 2020 11:54:25 +1000
Subject: [PATCH 42/59] Update temp_stat_plot.py

---
 ocean_dp/qc/temp_stat_plot.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/ocean_dp/qc/temp_stat_plot.py b/ocean_dp/qc/temp_stat_plot.py
index bc73767..63673ba 100755
--- a/ocean_dp/qc/temp_stat_plot.py
+++ b/ocean_dp/qc/temp_stat_plot.py
@@ -241,12 +241,16 @@ def std_by_depth(top,bottom,deployment_in=None):
     # calculates the standard deviation of the subsample
     target_std = np.std(target_ensemble["Temp rate of change"])
         
+    # sets line thickness for plot
     line_thick = 1
     
+    # creates axes for histogram
     ax_hist=plt.axes()
     
+    # plots a histogram of the data selected
     target_ensemble.hist(column="Temp rate of change",bins=100,log=True,ax=ax_hist)
     
+    # draws lines at the mean +- 3 STD on the histogram
     ax_hist.axvline(x=target_mean+3*target_std,color='r',linewidth=line_thick) 
 
     ax_hist.axvline(x=target_mean-3*target_std,color='r',linewidth=line_thick) 

From 6f3748ecb6dcd061acae485cc099e5ad22fc5ded Mon Sep 17 00:00:00 2001
From: bweeding <57697604+bweeding@users.noreply.github.com>
Date: Wed, 20 May 2020 14:45:28 +1000
Subject: [PATCH 43/59] Update temp_stat_plot.py

---
 ocean_dp/qc/temp_stat_plot.py | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/ocean_dp/qc/temp_stat_plot.py b/ocean_dp/qc/temp_stat_plot.py
index 63673ba..fd0b86a 100755
--- a/ocean_dp/qc/temp_stat_plot.py
+++ b/ocean_dp/qc/temp_stat_plot.py
@@ -255,6 +255,30 @@ def std_by_depth(top,bottom,deployment_in=None):
 
     ax_hist.axvline(x=target_mean-3*target_std,color='r',linewidth=line_thick) 
     
+    # sets the x label
+    ax_hist.set_xlabel('°C/hr')
+    
+    
+    label_coords = (0.65, 0.8)
+    label_method = 'axes fraction' 
+    
+    anno = 'mean = '+str(round(float(target_mean),sigfigs=3))
+    
+    anno += '\n3 STD = ' + str(round(float(3*target_std),sigfigs=3))
+    
+    anno += '\nno. samples = ' + str(len(target_ensemble))
+    
+    anno += '\n'+str(top)+'m <= depth <= '+str(bottom)+'m'
+    
+    if deployment_in == None:
+        
+        anno += '\nall available data'
+        
+    else:
+        
+        anno += '\n'+deployment_in
+            
+    ax_hist.annotate(anno,xy=label_coords, xycoords=label_method,fontsize=8)
     
     # returns the standard deviation of the subsample
     return target_std

From 7d28a8c13ceb46e64b14e82e9565502a4fed916c Mon Sep 17 00:00:00 2001
From: bweeding <57697604+bweeding@users.noreply.github.com>
Date: Wed, 20 May 2020 16:19:29 +1000
Subject: [PATCH 44/59] creation of psal stat plot

---
 ocean_dp/qc/psal_stat_plot.py | 310 ++++++++++++++++++++++++++++++++++
 ocean_dp/qc/temp_stat_plot.py |   2 +-
 2 files changed, 311 insertions(+), 1 deletion(-)
 create mode 100755 ocean_dp/qc/psal_stat_plot.py

diff --git a/ocean_dp/qc/psal_stat_plot.py b/ocean_dp/qc/psal_stat_plot.py
new file mode 100755
index 0000000..8449b2a
--- /dev/null
+++ b/ocean_dp/qc/psal_stat_plot.py
@@ -0,0 +1,310 @@
+# Copyright (C) 2020 Ben Weeding
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+
+import numpy.ma as ma
+import sys
+from netCDF4 import Dataset, num2date
+from dateutil import parser
+import numpy as np
+import argparse
+import glob
+import pytz
+import os
+import matplotlib.pyplot as plt
+from matplotlib import colors
+from matplotlib.ticker import PercentFormatter
+from sigfig import round
+import pandas as pd
+
+############################# Data extraction ################################
+
+# creates an empty array to store the names of the SOTS deployments
+deployments = []
+
+checked_files = []
+
+processed_files = []
+
+# loops through all the folders and files contained in the folder
+for x in os.listdir("/Users/tru050/Desktop/cloudstor/Shared/SOTS-Temp-Raw-Data"):
+    
+    # if the folder/file name contains 'Pulse' or 'SOFS' and doesn't contain '.p', append it to deployments
+    if (('Pulse' in x) or ('SOFS' in x)) and ('.p' not in x):
+        
+        deployments.append(x)
+        
+        
+# create a dataframe to store extract information
+sots_psal_ensemble = pd.DataFrame(columns = ["PSAL rate of change","QC","Nominal depth","Deployment"])
+
+
+# loops through all files in the directory
+for root, dirs, files in os.walk("/Users/tru050/Desktop/cloudstor/Shared/SOTS-Temp-Raw-Data"):
+    
+    for fname in files:
+        
+        # append the filename to the list of checked files
+        checked_files.append(fname)
+        
+        # for each netcdf file labelled as FV01 and containing a deployment in its name
+        if fname.endswith('.nc') and 'FV01' in fname and any(ele in fname for ele in deployments):
+        
+            # print the filename
+            print(fname)  
+            
+            # open the file
+            nc = Dataset(os.path.join(root,fname), mode = 'a')
+            
+            # check file contains salinity (PSAL) data
+            if 'PSAL' in list(nc.variables):
+            
+                # check that the in_out_water test has been run on the file, if not run in_out_water code
+                if not 'PSAL_quality_control_io' in list(nc.variables):
+                
+                    # run in_out_water script - uncommented at this point as just copied and pasted
+                    var_name = 'PSAL'
+                    nc_vars = nc.variables
+                    to_add = []
+                    if var_name:
+                        to_add.append(var_name)
+                    else:
+                        for v in nc_vars:
+                            #print (vars[v].dimensions)
+                            if v != 'TIME':
+                                to_add.append(v)
+                
+                    time_var = nc_vars["TIME"]
+                    time = num2date(time_var[:], units=time_var.units, calendar=time_var.calendar)
+                
+                    time_deploy = parser.parse(nc.time_deployment_start, ignoretz=True)
+                    time_recovery = parser.parse(nc.time_deployment_end, ignoretz=True)
+                
+                    print('deployment time', time_deploy)
+                
+                    print(to_add)
+                
+                    # create a mask for the time range
+                    mask = (time <= time_deploy) | (time >= time_recovery)
+                
+                    for v in to_add:
+                        if "TIME" in nc_vars[v].dimensions:
+                            if v.endswith("_quality_control"):
+                                print("QC time dim ", v)
+                
+                                ncVarOut = nc_vars[v]
+                                ncVarOut[mask] = 7
+                            else:
+                                # create a qc variable just for this test flags
+                                if v + "_quality_control_io" in nc.variables:
+                                    ncVarOut = nc.variables[v + "_quality_control_io"]
+                                else:
+                                    ncVarOut = nc.createVariable(v + "_quality_control_io", "i1", nc_vars[v].dimensions, fill_value=99, zlib=True)  # fill_value=0 otherwise defaults to max
+                                ncVarOut[:] = np.zeros(nc_vars[v].shape)
+                                ncVarOut.long_name = "quality flag for " + v
+                                ncVarOut.flag_values = np.array([0, 1, 2, 3, 4, 6, 7, 9], dtype=np.int8)
+                                ncVarOut.flag_meanings = 'unknown good_data probably_good_data probably_bad_data bad_data not_deployed interpolated missing_value'
+                
+                                nc_vars[v].ancillary_variables = nc_vars[v].ancillary_variables + " " + v + "_quality_control_io"
+                                ncVarOut[mask] = 7
+                        
+                        nc.variables[v + "_quality_control"][:] = np.maximum(nc.variables[v + "_quality_control_io"][:],nc.variables[v + "_quality_control"][:])
+                
+                    nc.file_version = "Level 1 - Quality Controlled Data"
+                
+                
+                
+                # check that the file has a single-dimension salinity (PSAL) vector, and that the time format is correct
+                if np.array(nc.variables['PSAL'][:]).ndim == 1 and nc.variables['TIME'].getncattr('units') =='days since 1950-01-01 00:00:00 UTC':
+                    
+                    # calculate salinity changes for in-water data
+                    nc_psal_diffs = np.diff(np.array(nc.variables['PSAL'][np.array(nc.variables['PSAL_quality_control'][:])!=7]))
+                
+                    # extract the time data
+                    nc_time = np.array(nc.variables['TIME'][np.array(nc.variables['PSAL_quality_control'][:])!=7])
+            
+                    # Convert from days to hours
+                    nc_time_hr = nc_time*24
+                    
+                    # Calculate time changes in hours
+                    nc_time_hr_diffs = np.diff(nc_time_hr)
+                    
+                    # calculate the rate of change of salinity wrt time (PSU per hour)
+                    nc_dpsal_dtime = np.divide(nc_psal_diffs,nc_time_hr_diffs)
+                    
+                    
+                    
+                    # extract psal_qc data
+                    nc_psal_qc = np.array(nc.variables['PSAL_quality_control'][np.array(nc.variables['PSAL_quality_control'][:])!=7])
+                    
+                    # calculate qc values for each nc_dpsal_dtime by taking the maximum of the qc values of the two contributing psals
+                    nc_dpsal_dtime_qc = pd.Series(nc_psal_qc).rolling(2).max().dropna().to_numpy()
+                    
+                    
+                    
+                    # extract sensor nominal depth
+                    nc_nom_depth = np.array(nc.variables['NOMINAL_DEPTH'])
+                    
+                    # create a vector the same size as nc_dpsal_dtime with the nominal depth
+                    nc_nom_depth_vector = np.repeat(nc_nom_depth,len(nc_dpsal_dtime))
+                    
+                    
+                    
+                    # extract deployment name
+                    nc_deployment = nc.deployment_code
+                    
+                    # create a list the same size as nc_dpsal_dtime with the deployment name
+                    nc_deployment_list = [nc_deployment] * len(nc_dpsal_dtime)
+                    
+                    
+                    
+                    # combine information into an length x 4 dataframe
+                    nc_psal_ensemble = pd.DataFrame({"Psal rate of change":nc_dpsal_dtime,"QC":nc_dpsal_dtime_qc,"Nominal depth":nc_nom_depth_vector,"Deployment":nc_deployment_list})
+                    
+                    # append the current netcdf's dataframe to the sots_psal_ensemble
+                    sots_psal_ensemble = sots_psal_ensemble.append(nc_psal_ensemble)
+                    
+                    # append the filename to the list of processed files
+                    processed_files.append(fname)
+                    
+                    
+            nc.close()
+                    
+                    
+############################# Data processing ################################
+            
+# creates a new dataframe containing only data with QC < 3
+sots_psal_ensemble_qc210 = sots_psal_ensemble[sots_psal_ensemble["QC"]<3]
+
+# calculates overall standard deviation
+std_total = np.std(sots_psal_ensemble_qc210["Psal rate of change"])
+
+
+
+
+# creates an empty list to store data deployment by deployment
+std_by_deployment_data = []
+
+# appends one dict per deployment, holding the deployment name and its standard deviation
+for i in sots_psal_ensemble_qc210.Deployment.unique():
+    std_by_deployment_data.append(
+        {
+            'Deployment': i,
+            'STD': np.std(sots_psal_ensemble_qc210["Psal rate of change"][sots_psal_ensemble_qc210["Deployment"]==i]),
+        }
+    )
+
+# creates a Dataframe from the dict
+std_by_deployment = pd.DataFrame(std_by_deployment_data)
+
+
+
+
+
+# =============================================================================
+# std_by_depth_psal: this function takes two compulsory arguments (top: the shallowest
+# depth (m), bottom: the deepest depth (m)) and one optional argument (deployment_in:
+# the deployment from which data will be taken). The function will return the standard
+# deviation of the d(psal)/d(Time) data from sensors with nominal depths at and 
+# between the two depths, and from only the deployment_in if specified.
+#
+# sample call: std_by_depth_psal(500,10000,'SOFS-7.5-2018')
+#
+# this will give the std of all d(psal)/d(Time) data from SOFS-7.5-2018 from 
+# sensors with 500m <= nominal depth <= 10000m
+# =============================================================================
+
+def std_by_depth_psal(top,bottom,deployment_in=None):
+    
+    if deployment_in == None:
+    
+        # subsamples sots_psal_ensemble_qc210 based on depth
+        target_ensemble = sots_psal_ensemble_qc210[(sots_psal_ensemble_qc210["Nominal depth"]>=top) & (sots_psal_ensemble_qc210["Nominal depth"]<=bottom)]
+        
+    else:   
+    
+        # subsamples sots_psal_ensemble_qc210 based on depth
+        target_ensemble = sots_psal_ensemble_qc210[(sots_psal_ensemble_qc210["Nominal depth"]>=top) & (sots_psal_ensemble_qc210["Nominal depth"]<=bottom) & (sots_psal_ensemble_qc210["Deployment"]==deployment_in)]
+        
+    # calculates the mean of the subsample
+    target_mean = np.mean(target_ensemble["Psal rate of change"])
+            
+    # calculates the standard deviation of the subsample
+    target_std = np.std(target_ensemble["Psal rate of change"])
+        
+    # sets line thickness for plot
+    line_thick = 1
+    
+    # creates axes for histogram
+    ax_hist=plt.axes()
+    
+    # plots a histogram of the data selected
+    target_ensemble.hist(column="Psal rate of change",bins=100,log=True,ax=ax_hist)
+    
+    # draws lines at the mean +- 3 STD on the histogram
+    ax_hist.axvline(x=target_mean+3*target_std,color='r',linewidth=line_thick) 
+
+    ax_hist.axvline(x=target_mean-3*target_std,color='r',linewidth=line_thick) 
+    
+    # sets the x label
+    ax_hist.set_xlabel('PSU/hr')
+    
+    
+    label_coords = (0.65, 0.8)
+    label_method = 'axes fraction' 
+    
+    anno = 'mean = '+str(round(float(target_mean),sigfigs=3))
+    
+    anno += '\n3 STD = ' + str(round(float(3*target_std),sigfigs=3))
+    
+    anno += '\nno. samples = ' + str(len(target_ensemble))
+    
+    anno += '\n'+str(top)+'m <= depth <= '+str(bottom)+'m'
+    
+    if deployment_in == None:
+        
+        anno += '\nall available data'
+        
+    else:
+        
+        anno += '\n'+deployment_in
+            
+    ax_hist.annotate(anno,xy=label_coords, xycoords=label_method,fontsize=8)
+    
+    # returns the standard deviation of the subsample
+    return target_std
+    
+
+
+        
+        
+        
+        
+        
+        
+        
+        
+        
+        
+        
+        
+        
+        
+        
+        
+        
+        
+        
+        
\ No newline at end of file
diff --git a/ocean_dp/qc/temp_stat_plot.py b/ocean_dp/qc/temp_stat_plot.py
index fd0b86a..6eda254 100755
--- a/ocean_dp/qc/temp_stat_plot.py
+++ b/ocean_dp/qc/temp_stat_plot.py
@@ -223,7 +223,7 @@
 # sensors with 500m <= nominal depth <= 10000m
 # =============================================================================
 
-def std_by_depth(top,bottom,deployment_in=None):
+def std_by_depth_temp(top,bottom,deployment_in=None):
     
     if deployment_in == None:
     

From 56e469a954da103104a8089a40d7cf4f807bd3dc Mon Sep 17 00:00:00 2001
From: bweeding <57697604+bweeding@users.noreply.github.com>
Date: Thu, 21 May 2020 09:59:19 +1000
Subject: [PATCH 45/59] modified stat plot code

-now swaps depths if inputted incorrectly
-can now input list of deployments
-fixed label location
---
 ocean_dp/qc/psal_stat_plot.py          | 33 ++++++++++++++++++++++----
 ocean_dp/qc/temp_stat_plot.py          | 27 +++++++++++++++++++--
 ocean_dp/sots_processing_runthrough.py |  2 +-
 3 files changed, 55 insertions(+), 7 deletions(-)

diff --git a/ocean_dp/qc/psal_stat_plot.py b/ocean_dp/qc/psal_stat_plot.py
index 8449b2a..46999bc 100755
--- a/ocean_dp/qc/psal_stat_plot.py
+++ b/ocean_dp/qc/psal_stat_plot.py
@@ -228,15 +228,32 @@
 
 def std_by_depth_psal(top,bottom,deployment_in=None):
     
+    # if user incorrectly inputs depths, swap them and run the code
+    if top > bottom:
+        
+        top, bottom = bottom, top
+    
     if deployment_in == None:
     
         # subsamples sots_psal_ensemble_qc210 based on depth
         target_ensemble = sots_psal_ensemble_qc210[(sots_psal_ensemble_qc210["Nominal depth"]>=top) & (sots_psal_ensemble_qc210["Nominal depth"]<=bottom)]
         
+    elif isinstance(deployment_in, list):
+        
+        # subsamples sots_psal_ensemble_qc210 based on depth
+        target_ensemble = sots_psal_ensemble_qc210[(sots_psal_ensemble_qc210["Nominal depth"]>=top) & (sots_psal_ensemble_qc210["Nominal depth"]<=bottom) & (sots_psal_ensemble_qc210.Deployment.isin(deployment_in))]
+        
+        
     else:   
     
         # subsamples sots_psal_ensemble_qc210 based on depth
         target_ensemble = sots_psal_ensemble_qc210[(sots_psal_ensemble_qc210["Nominal depth"]>=top) & (sots_psal_ensemble_qc210["Nominal depth"]<=bottom) & (sots_psal_ensemble_qc210["Deployment"]==deployment_in)]
+    
+    # if no data is available with the given choices, end the function
+    if len(target_ensemble)==0:
+        
+        return 'No data available for those choices'
+        
         
     # calculates the mean of the subsample
     target_mean = np.mean(target_ensemble["Psal rate of change"])
@@ -262,7 +279,7 @@ def std_by_depth_psal(top,bottom,deployment_in=None):
     ax_hist.set_xlabel('PSU/hr')
     
     
-    label_coords = (0.65, 0.8)
+    label_coords = (0.70, 0.99)
     label_method = 'axes fraction' 
     
     anno = 'mean = '+str(round(float(target_mean),sigfigs=3))
@@ -277,19 +294,27 @@ def std_by_depth_psal(top,bottom,deployment_in=None):
         
         anno += '\nall available data'
         
+    elif isinstance(deployment_in, list):
+        
+        anno += '\n'
+        
+        anno += '\n'.join(deployment_in)
+        
     else:
         
         anno += '\n'+deployment_in
             
-    ax_hist.annotate(anno,xy=label_coords, xycoords=label_method,fontsize=8)
+    ax_hist.annotate(anno,xy=label_coords, xycoords=label_method,fontsize=8,va = "top", ha="left")
     
     # returns the standard deviation of the subsample
     return target_std
     
 
 
-        
-        
+
+
+
+
         
         
         
diff --git a/ocean_dp/qc/temp_stat_plot.py b/ocean_dp/qc/temp_stat_plot.py
index 6eda254..9efd849 100755
--- a/ocean_dp/qc/temp_stat_plot.py
+++ b/ocean_dp/qc/temp_stat_plot.py
@@ -225,15 +225,32 @@
 
 def std_by_depth_temp(top,bottom,deployment_in=None):
     
+    # if user incorrectly inputs depths, swap them and run the code
+    if top > bottom:
+        
+        top, bottom = bottom, top
+    
     if deployment_in == None:
     
         # subsamples sots_temp_ensemble_qc210 based on depth
         target_ensemble = sots_temp_ensemble_qc210[(sots_temp_ensemble_qc210["Nominal depth"]>=top) & (sots_temp_ensemble_qc210["Nominal depth"]<=bottom)]
         
+    elif isinstance(deployment_in, list):
+        
+        # subsamples sots_temp_ensemble_qc210 based on depth
+        target_ensemble = sots_temp_ensemble_qc210[(sots_temp_ensemble_qc210["Nominal depth"]>=top) & (sots_temp_ensemble_qc210["Nominal depth"]<=bottom) & (sots_temp_ensemble_qc210.Deployment.isin(deployment_in))]
+        
+        
     else:   
     
         # subsamples sots_temp_ensemble_qc210 based on depth
         target_ensemble = sots_temp_ensemble_qc210[(sots_temp_ensemble_qc210["Nominal depth"]>=top) & (sots_temp_ensemble_qc210["Nominal depth"]<=bottom) & (sots_temp_ensemble_qc210["Deployment"]==deployment_in)]
+    
+    # if no data is available with the given choices, end the function
+    if len(target_ensemble)==0:
+        
+        return 'No data available for those choices'
+        
         
     # calculates the mean of the subsample
     target_mean = np.mean(target_ensemble["Temp rate of change"])
@@ -259,7 +276,7 @@ def std_by_depth_temp(top,bottom,deployment_in=None):
     ax_hist.set_xlabel('°C/hr')
     
     
-    label_coords = (0.65, 0.8)
+    label_coords = (0.70, 0.99)
     label_method = 'axes fraction' 
     
     anno = 'mean = '+str(round(float(target_mean),sigfigs=3))
@@ -274,11 +291,17 @@ def std_by_depth_temp(top,bottom,deployment_in=None):
         
         anno += '\nall available data'
         
+    elif isinstance(deployment_in, list):
+        
+        anno += '\n'
+        
+        anno += '\n'.join(deployment_in)
+        
     else:
         
         anno += '\n'+deployment_in
             
-    ax_hist.annotate(anno,xy=label_coords, xycoords=label_method,fontsize=8)
+    ax_hist.annotate(anno,xy=label_coords, xycoords=label_method,fontsize=8,va = "top", ha="left")
     
     # returns the standard deviation of the subsample
     return target_std
diff --git a/ocean_dp/sots_processing_runthrough.py b/ocean_dp/sots_processing_runthrough.py
index c939838..cc7bc30 100755
--- a/ocean_dp/sots_processing_runthrough.py
+++ b/ocean_dp/sots_processing_runthrough.py
@@ -40,7 +40,7 @@
 start = time.time()
 
 # Set the working directory
-#os.chdir('/Users/tru050/Desktop/sofs6 test data')
+#os.chdir('‎/Users/tru050/Desktop/cloudstor/Shared/SOTS-Temp-Raw-Data/SOFS-5-2015')
 
 # Make a list of FV00 filenames
 fv00_files = glob.glob('*IMOS_ABOS-SOTS*FV00*.nc')

From 4317aff5f13bead9c778a4c68ec512b03436e61d Mon Sep 17 00:00:00 2001
From: bweeding <57697604+bweeding@users.noreply.github.com>
Date: Thu, 21 May 2020 13:16:29 +1000
Subject: [PATCH 46/59] Update temp_stat_plot.py

adds per sample functionality
---
 ocean_dp/qc/temp_stat_plot.py | 41 +++++++++++++++++++++++++++--------
 1 file changed, 32 insertions(+), 9 deletions(-)

diff --git a/ocean_dp/qc/temp_stat_plot.py b/ocean_dp/qc/temp_stat_plot.py
index 9efd849..bc1020d 100755
--- a/ocean_dp/qc/temp_stat_plot.py
+++ b/ocean_dp/qc/temp_stat_plot.py
@@ -46,7 +46,7 @@
         deployments.append(x)
     
 # create a dataframe to store extract information
-sots_temp_ensemble = pd.DataFrame(columns = ["Temp rate of change","QC","Nominal depth","Deployment"])
+sots_temp_ensemble = pd.DataFrame(columns = ["dTemp/dtime","dTemp/dSample","QC","Nominal depth","Deployment"])
 
 # loops through all files in the directory
 for root, dirs, files in os.walk("/Users/tru050/Desktop/cloudstor/Shared/SOTS-Temp-Raw-Data"):
@@ -168,7 +168,7 @@
                     
                     
                     # combine information into an length x 4 dataframe
-                    nc_temp_ensemble = pd.DataFrame({"Temp rate of change":nc_dtemp_dtime,"QC":nc_dtemp_dtime_qc,"Nominal depth":nc_nom_depth_vector,"Deployment":nc_deployment_list})
+                    nc_temp_ensemble = pd.DataFrame({"dTemp/dtime":nc_dtemp_dtime,"dTemp/dSample":nc_temp_diffs,"QC":nc_dtemp_dtime_qc,"Nominal depth":nc_nom_depth_vector,"Deployment":nc_deployment_list})
                     
                     # append the current netcdf's dataframe to the sots_temp_ensemble
                     sots_temp_ensemble = sots_temp_ensemble.append(nc_temp_ensemble)
@@ -186,7 +186,7 @@
 sots_temp_ensemble_qc210 = sots_temp_ensemble[sots_temp_ensemble["QC"]<3]
 
 # calculates overall standard deviation
-std_total = np.std(sots_temp_ensemble_qc210["Temp rate of change"])
+std_time_total = np.std(sots_temp_ensemble_qc210["dTemp/dtime"])
 
 
 
@@ -199,7 +199,8 @@
     std_by_deployment_data.append(
         {
             'Deployment': i,
-            'STD': np.std(sots_temp_ensemble_qc210["Temp rate of change"][sots_temp_ensemble_qc210["Deployment"]==i]),
+            'STD time': np.std(sots_temp_ensemble_qc210["dTemp/dtime"][sots_temp_ensemble_qc210["Deployment"]==i]),
+            'STD sample': np.std(sots_temp_ensemble_qc210["dTemp/dSample"][sots_temp_ensemble_qc210["Deployment"]==i])
         }
     )
 
@@ -223,7 +224,23 @@
 # sensors with 500m <= nominal depth <= 10000m
 # =============================================================================
 
-def std_by_depth_temp(top,bottom,deployment_in=None):
+def std_by_depth_temp(top,bottom,deployment_in=None,rate='time'):
+    
+    selection = ''
+    
+    if rate == 'time':
+        
+        selection = "dTemp/dtime"
+        
+    elif rate == 'sample':
+        
+        selection = "dTemp/dSample"
+        
+    else:
+        
+        return "incorrect rate specification"
+    
+    
     
     # if user incorrectly inputs depths, swap them and run the code
     if top > bottom:
@@ -253,10 +270,10 @@ def std_by_depth_temp(top,bottom,deployment_in=None):
         
         
     # calculates the mean of the subsample
-    target_mean = np.mean(target_ensemble["Temp rate of change"])
+    target_mean = np.mean(target_ensemble[selection])
             
     # calculates the standard deviation of the subsample
-    target_std = np.std(target_ensemble["Temp rate of change"])
+    target_std = np.std(target_ensemble[selection])
         
     # sets line thickness for plot
     line_thick = 1
@@ -265,7 +282,7 @@ def std_by_depth_temp(top,bottom,deployment_in=None):
     ax_hist=plt.axes()
     
     # plots a histogram of the data selected
-    target_ensemble.hist(column="Temp rate of change",bins=100,log=True,ax=ax_hist)
+    target_ensemble.hist(column=selection,bins=100,log=True,ax=ax_hist)
     
     # draws lines at the mean +- 3 STD on the histogram
     ax_hist.axvline(x=target_mean+3*target_std,color='r',linewidth=line_thick) 
@@ -273,7 +290,13 @@ def std_by_depth_temp(top,bottom,deployment_in=None):
     ax_hist.axvline(x=target_mean-3*target_std,color='r',linewidth=line_thick) 
     
     # sets the x label
-    ax_hist.set_xlabel('°C/hr')
+    if rate == 'time':
+        
+         ax_hist.set_xlabel('°C/hr')
+        
+    elif rate == 'sample':
+        
+        ax_hist.set_xlabel('°C')
     
     
     label_coords = (0.70, 0.99)

From f6b597fe273c9b3cfd65ca00daea4dec3fc008b3 Mon Sep 17 00:00:00 2001
From: bweeding <57697604+bweeding@users.noreply.github.com>
Date: Mon, 25 May 2020 15:29:52 +1000
Subject: [PATCH 47/59] update to stat plots

changed labels, added per sample functionality to psal plotter
---
 ocean_dp/plotting/batch-qc.pdf         | Bin 0 -> 208 bytes
 ocean_dp/plotting/plotQC.py            |   4 +--
 ocean_dp/qc/psal_stat_plot.py          |  42 +++++++++++++++++++------
 ocean_dp/qc/temp_stat_plot.py          |   9 +++++-
 ocean_dp/sots_processing_runthrough.py |   4 +--
 5 files changed, 44 insertions(+), 15 deletions(-)
 create mode 100644 ocean_dp/plotting/batch-qc.pdf

diff --git a/ocean_dp/plotting/batch-qc.pdf b/ocean_dp/plotting/batch-qc.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..5e7a87baea6886313712bca3fae565bb0266a9f5
GIT binary patch
literal 208
zcmY!laB<T$)HC5yU2{ia^)4<$1p|frq%1BQ8wLG<#Prl+1tXwHkb-_lWkITfzH?$p
zVorX#f}I^#YF-LRp#_o#*NPJN;F83WR0R`=Cb#^&5(Q%jJ0P*7B(*3{!4$;h(hn%g
zPYzBkQHTcG<KhN1KeeJnLEkeMXr~jDc81cP3b6|M!5N7unR)37W)STW{z+M>$t4Qr
JV8ap40RR*-I}iW>

literal 0
HcmV?d00001

diff --git a/ocean_dp/plotting/plotQC.py b/ocean_dp/plotting/plotQC.py
index 3060759..979976a 100755
--- a/ocean_dp/plotting/plotQC.py
+++ b/ocean_dp/plotting/plotQC.py
@@ -67,11 +67,11 @@ def do_plot(fn):
     DS = xr.open_dataset(fn)
 
     ax1 = plt.subplot(2, 1, 1)
-    plt.plot(DS.TIME, DS.PAR)
+    plt.plot(DS.TIME, DS.TEMP)
     plt.title(DS.deployment_code + " - " + DS.instrument_model + ":" + DS.instrument_serial_number + " @ " + str(DS.instrument_nominal_depth), {'fontsize': 8})
 
     ax2 = plt.subplot(2, 1, 2, sharex=ax1)
-    aux = DS.PAR.ancillary_variables
+    aux = DS.TEMP.ancillary_variables
     a_vars = aux.split(" ")
     for f in sorted(set(a_vars)):
         print('aux var', f)
diff --git a/ocean_dp/qc/psal_stat_plot.py b/ocean_dp/qc/psal_stat_plot.py
index 46999bc..cb9cbc7 100755
--- a/ocean_dp/qc/psal_stat_plot.py
+++ b/ocean_dp/qc/psal_stat_plot.py
@@ -48,7 +48,7 @@
         
         
 # create a dataframe to store extract information
-sots_psal_ensemble = pd.DataFrame(columns = ["PSAL rate of change","QC","Nominal depth","Deployment"])
+sots_psal_ensemble = pd.DataFrame(columns = ["dPsal/dtime","dPsal/dSample","QC","Nominal depth","Deployment"])
 
 
 # loops through all files in the directory
@@ -171,7 +171,7 @@
                     
                     
                     # combine information into an length x 4 dataframe
-                    nc_psal_ensemble = pd.DataFrame({"Psal rate of change":nc_dpsal_dtime,"QC":nc_dpsal_dtime_qc,"Nominal depth":nc_nom_depth_vector,"Deployment":nc_deployment_list})
+                    nc_psal_ensemble = pd.DataFrame({"dPsal/dtime":nc_dpsal_dtime,"dPsal/dSample":nc_psal_diffs,"QC":nc_dpsal_dtime_qc,"Nominal depth":nc_nom_depth_vector,"Deployment":nc_deployment_list})
                     
                     # append the current netcdf's dataframe to the sots_psal_ensemble
                     sots_psal_ensemble = sots_psal_ensemble.append(nc_psal_ensemble)
@@ -189,7 +189,7 @@
 sots_psal_ensemble_qc210 = sots_psal_ensemble[sots_psal_ensemble["QC"]<3]
 
 # calculates overall standard deviation
-std_total = np.std(sots_psal_ensemble_qc210["Psal rate of change"])
+std_total = np.std(sots_psal_ensemble_qc210["dPsal/dtime"])
 
 
 
@@ -202,7 +202,8 @@
     std_by_deployment_data.append(
         {
             'Deployment': i,
-            'STD': np.std(sots_psal_ensemble_qc210["Psal rate of change"][sots_psal_ensemble_qc210["Deployment"]==i]),
+            'STD': np.std(sots_psal_ensemble_qc210["dPsal/dtime"][sots_psal_ensemble_qc210["Deployment"]==i]),
+            'STD sample': np.std(sots_psal_ensemble_qc210["dPsal/dSample"][sots_psal_ensemble_qc210["Deployment"]==i])
         }
     )
 
@@ -226,7 +227,23 @@
 # sensors with 500m <= nominal depth <= 10000m
 # =============================================================================
 
-def std_by_depth_psal(top,bottom,deployment_in=None):
+def std_by_depth_psal(top,bottom,deployment_in=None,rate='time'):
+    
+    selection = ''
+    
+    if rate == 'time':
+        
+        selection = "dPsal/dtime"
+        
+    elif rate == 'sample':
+        
+        selection = "dPsal/dSample"
+        
+    else:
+        
+        return "incorrect rate specification"
+    
+    
     
     # if user incorrectly inputs depths, swap them and run the code
     if top > bottom:
@@ -256,10 +273,10 @@ def std_by_depth_psal(top,bottom,deployment_in=None):
         
         
     # calculates the mean of the subsample
-    target_mean = np.mean(target_ensemble["Psal rate of change"])
+    target_mean = np.mean(target_ensemble[selection])
             
     # calculates the standard deviation of the subsample
-    target_std = np.std(target_ensemble["Psal rate of change"])
+    target_std = np.std(target_ensemble[selection])
         
     # sets line thickness for plot
     line_thick = 1
@@ -268,7 +285,7 @@ def std_by_depth_psal(top,bottom,deployment_in=None):
     ax_hist=plt.axes()
     
     # plots a histogram of the data selected
-    target_ensemble.hist(column="Psal rate of change",bins=100,log=True,ax=ax_hist)
+    target_ensemble.hist(column=selection,bins=100,log=True,ax=ax_hist)
     
     # draws lines at the mean +- 3 STD on the histogram
     ax_hist.axvline(x=target_mean+3*target_std,color='r',linewidth=line_thick) 
@@ -276,7 +293,13 @@ def std_by_depth_psal(top,bottom,deployment_in=None):
     ax_hist.axvline(x=target_mean-3*target_std,color='r',linewidth=line_thick) 
     
     # sets the x label
-    ax_hist.set_xlabel('PSU/hr')
+    if rate == 'time':
+        
+         ax_hist.set_xlabel('PSU/hr')
+        
+    elif rate == 'sample':
+        
+        ax_hist.set_xlabel('PSU/sample')
     
     
     label_coords = (0.70, 0.99)
@@ -314,7 +337,6 @@ def std_by_depth_psal(top,bottom,deployment_in=None):
 
 
 
-
         
         
         
diff --git a/ocean_dp/qc/temp_stat_plot.py b/ocean_dp/qc/temp_stat_plot.py
index bc1020d..5b31a82 100755
--- a/ocean_dp/qc/temp_stat_plot.py
+++ b/ocean_dp/qc/temp_stat_plot.py
@@ -28,6 +28,12 @@
 from sigfig import round
 import pandas as pd
 
+import warnings
+import scipy.stats as st
+import statsmodels as sm
+import matplotlib
+
+
 ############################# Data extraction ################################
 
 # creates an empty array to store the names of the SOTS deployments
@@ -296,7 +302,7 @@ def std_by_depth_temp(top,bottom,deployment_in=None,rate='time'):
         
     elif rate == 'sample':
         
-        ax_hist.set_xlabel('°C')
+        ax_hist.set_xlabel('°C/sample')
     
     
     label_coords = (0.70, 0.99)
@@ -351,6 +357,7 @@ def std_by_depth_temp(top,bottom,deployment_in=None,rate='time'):
 
 
 
+
 
 
             
diff --git a/ocean_dp/sots_processing_runthrough.py b/ocean_dp/sots_processing_runthrough.py
index cc7bc30..c389d27 100755
--- a/ocean_dp/sots_processing_runthrough.py
+++ b/ocean_dp/sots_processing_runthrough.py
@@ -70,13 +70,13 @@
     global_range.global_range(ncfile,'TEMP',40,-2)
 
 # Rate of change
-rate_of_change_test.roc_test_files(fv01_pres_interp_files,'TEMP',10)
+rate_of_change_test.roc_test_files(fv01_pres_interp_files,'TEMP',3.36)
 
 # Spike
 spike_test.spike_test_files(fv01_pres_interp_files,target_vars_in=['TEMP'])
 
 # Flatline
-flatline_test.flatline_test_files(fv01_pres_interp_files,['TEMP'],window=10)
+flatline_test.flatline_test_files(fv01_pres_interp_files,['TEMP'],window=20)
 
 # Check qc process has worked
 fv01_qc_checked = qc_checker.qc_checker_files(fv01_pres_interp_files,['TEMP'])

From d3839b146110f506a63722f2592e190f3b819b60 Mon Sep 17 00:00:00 2001
From: bweeding <57697604+bweeding@users.noreply.github.com>
Date: Wed, 27 May 2020 16:53:31 +1000
Subject: [PATCH 48/59] Create add_density.py

---
 ocean_dp/processing/add_density.py | 78 ++++++++++++++++++++++++++++++
 1 file changed, 78 insertions(+)
 create mode 100755 ocean_dp/processing/add_density.py

diff --git a/ocean_dp/processing/add_density.py b/ocean_dp/processing/add_density.py
new file mode 100755
index 0000000..6e56a53
--- /dev/null
+++ b/ocean_dp/processing/add_density.py
@@ -0,0 +1,78 @@
+# Copyright (C) 2020 Ben Weeding and Peter Jansen
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+
+from netCDF4 import Dataset
+import sys
+import gsw
+import numpy as np
+from datetime import datetime
+
+# add density to a data file with TEMP, PSAL, PRES variables, many assumptions are made about the input file
+# based on Peter Jansen's addPSAL.py, using TEOS-10
+
+def add_density(netCDFfile):
+    
+    # loads the netcdf file
+    ds = Dataset(netCDFfile, 'a')
+    
+    if 'DENSITY' in list(ds.variables):
+    
+        ds.close()
+        
+        return "file already contains density"
+
+    # extracts the variables from the netcdf
+    var_temp = ds.variables["TEMP"]
+    var_psal = ds.variables["PSAL"]
+    var_pres = ds.variables["PRES"]
+    var_lon = ds.variables["LONGITUDE"]
+    var_lat = ds.variables["LATITUDE"]
+
+    # extracts the data from the variables
+    t = var_temp[:]
+    psal = var_psal[:]
+    p = var_pres[:]
+    lon = var_lon[:]
+    lat = var_lat[:]
+    
+    # calculates absolute salinity
+    SA = gsw.SA_from_SP(psal, p, lon, lat)
+    
+    # calculates conservative temperature
+    CT = gsw.CT_from_t(SA, t, p)
+    
+    # calculates density
+    density = gsw.rho(SA, CT, p)
+    
+
+    ncVarOut = ds.createVariable("DENSITY", "f4", ("TIME",), fill_value=np.nan, zlib=True)  # fill_value=nan otherwise defaults to max
+    ncVarOut[:] = density
+    ncVarOut.units = "kg/m^3"
+    ncVarOut.comment = "calculated using gsw-python https://teos-10.github.io/GSW-Python/index.html"
+
+    # update the history attribute
+    try:
+        hist = ds.history + "\n"
+    except AttributeError:
+        hist = ""
+
+    ds.setncattr('history', hist + datetime.utcnow().strftime("%Y-%m-%d") + " : added DENSITY from TEMP, PSAL, PRES, LAT, LON")
+
+    ds.close()
+
+
+if __name__ == "__main__":
+    add_density(sys.argv[1])

From 031fd0e9e0d022b697448cef3967c9784b8ac909 Mon Sep 17 00:00:00 2001
From: bweeding <57697604+bweeding@users.noreply.github.com>
Date: Thu, 28 May 2020 16:11:27 +1000
Subject: [PATCH 49/59] Making Pandas

Don't know what zoos are complaining about
---
 ocean_dp/file_name/find_file_with.py |  90 +++++++++++++++++++
 ocean_dp/plotting/density_plot.py    |  25 ++++++
 ocean_dp/plotting/panda_maker.py     | 126 +++++++++++++++++++++++++++
 ocean_dp/processing/add_density.py   |   4 +-
 4 files changed, 244 insertions(+), 1 deletion(-)
 create mode 100755 ocean_dp/file_name/find_file_with.py
 create mode 100755 ocean_dp/plotting/density_plot.py
 create mode 100755 ocean_dp/plotting/panda_maker.py

diff --git a/ocean_dp/file_name/find_file_with.py b/ocean_dp/file_name/find_file_with.py
new file mode 100755
index 0000000..59225b3
--- /dev/null
+++ b/ocean_dp/file_name/find_file_with.py
@@ -0,0 +1,90 @@
+#!/usr/bin/python3
+
+# raw2netCDF
+# Copyright (C) 2019 Peter Jansen
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+import glob
+
+import sys
+import re
+
+from netCDF4 import Dataset
+
+
+def find_files_pattern(file_pattern):
+    match_files = []
+    files = glob.glob(file_pattern)
+
+    match_files.extend(files)
+    return match_files
+
+def find_global(files, attribute, regexp):
+
+    match_files = []
+    #print("find", file_pattern, files)
+    for f in files:
+        #print("check file", f)
+        ds = Dataset(f, 'r')
+        if attribute in ds.ncattrs():
+            if re.match(regexp, ds.getncattr(attribute)):
+                match_files.append(f)
+        ds.close()
+
+    return match_files
+
+
+def find_variable(files, variable):
+
+    match_files = []
+    for f in files:
+        #print("check file", f)
+        ds = Dataset(f, 'r')
+        if variable in ds.variables:
+            match_files.append(f)
+        ds.close()
+
+    return match_files
+
+
+def find_variable_attribute(files, attribute, value):
+
+    match_files = []
+    for f in files:
+        #print("check file", f)
+        ds = Dataset(f, 'r')
+        nv = {attribute : value}
+        find = ds.get_variables_by_attributes(**nv)
+        if len(find) > 0:
+            match_files.append(f)
+        ds.close()
+
+    return match_files
+
+
+if __name__ == "__main__":
+    fns = []
+    if sys.argv[1] == '-v':
+        files = find_files_pattern(sys.argv[3])
+        fns = find_variable(files, variable=sys.argv[2])
+    elif sys.argv[1] == '-a':
+        files = find_files_pattern(sys.argv[4])
+        fns = find_variable_attribute(files, attribute=sys.argv[2], value=sys.argv[3])
+    else:
+        files = find_files_pattern(sys.argv[4])
+        fns = find_global(files, attribute=sys.argv[1], regexp=sys.argv[2])
+
+    for f in fns:
+        print(f)
\ No newline at end of file
diff --git a/ocean_dp/plotting/density_plot.py b/ocean_dp/plotting/density_plot.py
new file mode 100755
index 0000000..a798b02
--- /dev/null
+++ b/ocean_dp/plotting/density_plot.py
@@ -0,0 +1,25 @@
+# Copyright (C) 2020 Ben Weeding
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+import sys
+import os
+
+sys.path.append('/Users/tru050/Documents/GitHub/imos-tools/ocean_dp/file_name')
+
+import find_file_with
+
+path = "/Users/Tru050/Desktop/cloudstor/Shared/SOTS-Temp-Raw-Data"
+
+sots_files = find_file_with.find_files_pattern(os.path.join(path, "IMOS*FV00*.nc"))
\ No newline at end of file
diff --git a/ocean_dp/plotting/panda_maker.py b/ocean_dp/plotting/panda_maker.py
new file mode 100755
index 0000000..b21add7
--- /dev/null
+++ b/ocean_dp/plotting/panda_maker.py
@@ -0,0 +1,126 @@
+# Copyright (C) 2020 Ben Weeding
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+import numpy.ma as ma
+import sys
+from netCDF4 import Dataset, num2date
+from dateutil import parser
+import numpy as np
+import argparse
+import glob
+import pytz
+import os
+import matplotlib.pyplot as plt
+from matplotlib import colors
+from matplotlib.ticker import PercentFormatter
+from sigfig import round
+import pandas as pd
+
+import warnings
+import scipy.stats as st
+import statsmodels as sm
+import matplotlib
+
+# this function creates a pandas Datatable object, searching through all the 
+# netcdf files in the directory given, containing all the variables specified
+
+# "/Users/tru050/Desktop/cloudstor/Shared/SOTS-Temp-Raw-Data"
+
+# ["dTemp/dtime","dTemp/dSample","QC","Nominal depth","Deployment"]
+
+#  qc selection!!!
+
+def panda_maker(dir_spec,var_list,qc_lim=2):
+
+    # creates an empty array to store the names of the SOTS deployments
+    deployments = []
+    
+    checked_files = []
+    
+    processed_files = []
+    
+    # loops through all the folders and files contained in the folder
+    for x in os.listdir(dir_spec):
+        
+        # if the folder/file name contains 'Pulse' or 'SOFS' and doesn't contain '.', append it to deployments
+        if (('Pulse' in x) or ('SOFS' in x)) and ('.p' not in x):
+            
+            deployments.append(x)
+        
+        
+        
+    # create a dataframe to store extract information
+    total_df = pd.DataFrame(columns = var_list)
+    
+    # add deployment code to the dataframe
+    total_df.insert(len(var_list),'Deployment code',[])
+    
+    # loops through all files in the directory
+    for root, dirs, files in os.walk("/Users/tru050/Desktop/cloudstor/Shared/SOTS-Temp-Raw-Data"):
+        
+        for fname in files:
+            
+            # append the filename to the list of checked files
+            checked_files.append(fname)
+            
+            # for each netcdf file labelled as FV01 and containing a deployment in its name
+            if fname.endswith('.nc') and 'FV01' in fname and any(ele in fname for ele in deployments):
+            
+                # print the filename
+                print(fname)  
+                
+                # open the file
+                nc = Dataset(os.path.join(root,fname), mode = 'r')
+                
+                # check file contains all the specified variables and the time format is correct
+                if (all(ele in list(nc.variables) for ele in var_list)) & (nc.variables['TIME'].getncattr('units') =='days since 1950-01-01 00:00:00 UTC'):
+                        
+                    # create a current dataframe for the netcdf file, to be appended to the overall dataframe
+                    cur_df = pd.DataFrame(columns=var_list)
+                    
+                    # create a qc vector for the netcdf file
+                    cur_qc = np.zeros(nc.variables["TIME"].shape)
+
+                    for cur_var in var_list:
+                        
+                        if np.array(nc.variables[cur_var]).size == 1:
+                            
+                            filling = np.ones(nc.variables["TIME"].shape) * np.array(nc.variables[cur_var])
+                            
+                        else:
+                            
+                            filling = np.array(nc.variables[cur_var][:])
+                            
+                        if cur_var + '_quality_control' in list(nc.variables):
+                            
+                            cur_qc = np.maximum(cur_qc,np.array(nc.variables[cur_var + '_quality_control']))        
+                            
+                            
+                        cur_df[cur_var] = filling
+                    
+                    cur_df['Deployment code'] = [nc.deployment_code] * len(np.array(nc.variables['TIME']))
+
+                    # append the current netcdf's dataframe to the sots_temp_ensemble
+                    total_df = total_df.append(cur_df.iloc[np.where(cur_qc<=qc_lim)])
+                    
+                    # append the filename to the list of processed files
+                    processed_files.append(fname)
+                        
+                        
+                nc.close()
+                
+                
+    return total_df
+                        
\ No newline at end of file
diff --git a/ocean_dp/processing/add_density.py b/ocean_dp/processing/add_density.py
index 6e56a53..9320ddf 100755
--- a/ocean_dp/processing/add_density.py
+++ b/ocean_dp/processing/add_density.py
@@ -57,8 +57,10 @@ def add_density(netCDFfile):
     # calculates density
     density = gsw.rho(SA, CT, p)
     
-
+    # generates a new variable 'DENSITY' in the netcdf
     ncVarOut = ds.createVariable("DENSITY", "f4", ("TIME",), fill_value=np.nan, zlib=True)  # fill_value=nan otherwise defaults to max
+    
+    # assigns the calculated densities to the DENSITY variable, sets the units as kg/m^3, and comments on the variable's origin
     ncVarOut[:] = density
     ncVarOut.units = "kg/m^3"
     ncVarOut.comment = "calculated using gsw-python https://teos-10.github.io/GSW-Python/index.html"

From 3d32c27082ae4fa98c06d7d7427a88aab4eb19b2 Mon Sep 17 00:00:00 2001
From: bweeding <57697604+bweeding@users.noreply.github.com>
Date: Thu, 28 May 2020 16:19:49 +1000
Subject: [PATCH 50/59] Update panda_maker.py

---
 ocean_dp/plotting/panda_maker.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/ocean_dp/plotting/panda_maker.py b/ocean_dp/plotting/panda_maker.py
index b21add7..5c3aa70 100755
--- a/ocean_dp/plotting/panda_maker.py
+++ b/ocean_dp/plotting/panda_maker.py
@@ -121,6 +121,8 @@ def panda_maker(dir_spec,var_list,qc_lim=2):
                         
                 nc.close()
                 
+    total_df = total_df.reset_index()
+                
                 
     return total_df
                         
\ No newline at end of file

From 11b897320e45f92d7a6b1638a931254b0864d879 Mon Sep 17 00:00:00 2001
From: bweeding <57697604+bweeding@users.noreply.github.com>
Date: Thu, 28 May 2020 16:22:22 +1000
Subject: [PATCH 51/59] Update panda_maker.py

---
 ocean_dp/plotting/panda_maker.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ocean_dp/plotting/panda_maker.py b/ocean_dp/plotting/panda_maker.py
index 5c3aa70..2fc9bfb 100755
--- a/ocean_dp/plotting/panda_maker.py
+++ b/ocean_dp/plotting/panda_maker.py
@@ -121,7 +121,7 @@ def panda_maker(dir_spec,var_list,qc_lim=2):
                         
                 nc.close()
                 
-    total_df = total_df.reset_index()
+    total_df = total_df.reset_index(drop=True)
                 
                 
     return total_df

From d75303b06ab44354b656b30b1f760f6574092c27 Mon Sep 17 00:00:00 2001
From: bweeding <57697604+bweeding@users.noreply.github.com>
Date: Fri, 29 May 2020 13:50:12 +1000
Subject: [PATCH 52/59] Create panda_merger.py

---
 ocean_dp/processing/panda_merger.py | 73 +++++++++++++++++++++++++++++
 1 file changed, 73 insertions(+)
 create mode 100755 ocean_dp/processing/panda_merger.py

diff --git a/ocean_dp/processing/panda_merger.py b/ocean_dp/processing/panda_merger.py
new file mode 100755
index 0000000..1425f5c
--- /dev/null
+++ b/ocean_dp/processing/panda_merger.py
@@ -0,0 +1,73 @@
+# Copyright (C) 2020 Ben Weeding
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+import numpy.ma as ma
+import sys
+from netCDF4 import Dataset, num2date
+from dateutil import parser
+from datetime import datetime as dt
+from datetime import timedelta 
+import numpy as np
+import argparse
+import glob
+import pytz
+import os
+import matplotlib.pyplot as plt
+from matplotlib import colors
+from matplotlib.ticker import PercentFormatter
+from sigfig import round
+import pandas as pd
+
+nc1 = Dataset('IMOS_ABOS-SOTS_COPST_20180801_SOFS_FV00_SOFS-7.5-2018-SBE37SMP-ODO-RS232-03715971-200m_END-20190324_C-20200401.nc',mode='r')
+
+nc2 = Dataset('IMOS_ABOS-SOTS_T_20180801_SOFS_FV00_SOFS-7.5-2018-Starmon-mini-4048-45m_END-20190331_C-20200401.nc',mode='r')
+
+df1 = pd.DataFrame({'TIME':np.array(nc1.variables['TIME'][:]),'TEMP':np.array(nc1.variables['TEMP'][:])})
+
+df1_time = np.array(nc1.variables['TIME'][:])
+
+df2 = pd.DataFrame({'TIME':np.array(nc2.variables['TIME'][:]),'TEMP':np.array(nc2.variables['TEMP'][:])})
+
+# convert datenums to datetimes
+
+pd.to_timedelta(df1['TIME'],unit='D')+dt(1950,1,1)
+
+df1['TIME']=pd.to_timedelta(df1['TIME'],unit='D')+dt(1950,1,1)
+
+df1a=df1.set_index('TIME')
+
+# resample hourly 
+
+df1h=df1a.resample('H',base=0.5).mean()
+
+df1h.index = df1h.index + pd.Timedelta('30 min')
+
+
+
+# merge
+
+
+
+
+
+
+
+
+
+
+
+
+
+

From 1c07d8cc57efd37a82324a47ed9ea9081fbc1a85 Mon Sep 17 00:00:00 2001
From: bweeding <57697604+bweeding@users.noreply.github.com>
Date: Fri, 29 May 2020 14:27:05 +1000
Subject: [PATCH 53/59] Update panda_merger.py

---
 ocean_dp/processing/panda_merger.py | 29 ++++++++++++++++++-----------
 1 file changed, 18 insertions(+), 11 deletions(-)

diff --git a/ocean_dp/processing/panda_merger.py b/ocean_dp/processing/panda_merger.py
index 1425f5c..6f69822 100755
--- a/ocean_dp/processing/panda_merger.py
+++ b/ocean_dp/processing/panda_merger.py
@@ -30,33 +30,40 @@
 from sigfig import round
 import pandas as pd
 
+# import two netcdf
 nc1 = Dataset('IMOS_ABOS-SOTS_COPST_20180801_SOFS_FV00_SOFS-7.5-2018-SBE37SMP-ODO-RS232-03715971-200m_END-20190324_C-20200401.nc',mode='r')
 
 nc2 = Dataset('IMOS_ABOS-SOTS_T_20180801_SOFS_FV00_SOFS-7.5-2018-Starmon-mini-4048-45m_END-20190331_C-20200401.nc',mode='r')
 
-df1 = pd.DataFrame({'TIME':np.array(nc1.variables['TIME'][:]),'TEMP':np.array(nc1.variables['TEMP'][:])})
+# convert their time and temp data into dataframes
+df1 = pd.DataFrame({'TIME':np.array(nc1.variables['TIME'][:]),'TEMP_200':np.array(nc1.variables['TEMP'][:])})
 
-df1_time = np.array(nc1.variables['TIME'][:])
+df2 = pd.DataFrame({'TIME':np.array(nc2.variables['TIME'][:]),'TEMP_45':np.array(nc2.variables['TEMP'][:])})
 
-df2 = pd.DataFrame({'TIME':np.array(nc2.variables['TIME'][:]),'TEMP':np.array(nc2.variables['TEMP'][:])})
+# convert the times from days since 01-01-1950 to a datetime object
+df1['TIME']=pd.to_timedelta(df1['TIME'],unit='D')+dt(1950,1,1)
 
-# convert datenums to datetimes
+df2['TIME']=pd.to_timedelta(df2['TIME'],unit='D')+dt(1950,1,1)
 
-pd.to_timedelta(df1['TIME'],unit='D')+dt(1950,1,1)
+# index the dataframes by time
+df1=df1.set_index('TIME')
 
-df1['TIME']=pd.to_timedelta(df1['TIME'],unit='D')+dt(1950,1,1)
+df2=df2.set_index('TIME')
 
-df1a=df1.set_index('TIME')
 
-# resample hourly 
+# resample the data, calculating the mean over hourly periods, starting on the half hour
+df1=df1.resample('H',base=0.5).mean()
 
-df1h=df1a.resample('H',base=0.5).mean()
+df2=df2.resample('H',base=0.5).mean()
 
-df1h.index = df1h.index + pd.Timedelta('30 min')
+# reset the labels so they read the hour in the centre of the averaging period
+df1.index = df1.index + pd.Timedelta('30 min')
 
+df2.index = df2.index + pd.Timedelta('30 min')
 
 
-# merge
+# combine the two dataframes based on their time indicies, recording nan if one sensor doesn't have a reading for that timestamp
+total_df = pd.concat([df1,df2], join='outer', axis=1)
 
 
 

From 19a80341ed3d41e60d18b33255c1c47e758a2acc Mon Sep 17 00:00:00 2001
From: bweeding <57697604+bweeding@users.noreply.github.com>
Date: Fri, 29 May 2020 14:28:23 +1000
Subject: [PATCH 54/59] Update panda_merger.py

---
 ocean_dp/processing/panda_merger.py | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/ocean_dp/processing/panda_merger.py b/ocean_dp/processing/panda_merger.py
index 6f69822..69de3f6 100755
--- a/ocean_dp/processing/panda_merger.py
+++ b/ocean_dp/processing/panda_merger.py
@@ -20,14 +20,7 @@
 from datetime import datetime as dt
 from datetime import timedelta 
 import numpy as np
-import argparse
-import glob
-import pytz
-import os
-import matplotlib.pyplot as plt
-from matplotlib import colors
-from matplotlib.ticker import PercentFormatter
-from sigfig import round
+
 import pandas as pd
 
 # import two netcdf

From fe99b4eebf283ba9faa2c1f3cab31be1d4724147 Mon Sep 17 00:00:00 2001
From: bweeding <57697604+bweeding@users.noreply.github.com>
Date: Fri, 29 May 2020 15:46:23 +1000
Subject: [PATCH 55/59] Update panda_merger.py

---
 ocean_dp/processing/panda_merger.py | 38 ++++++++++++++++++++++++++---
 1 file changed, 34 insertions(+), 4 deletions(-)

diff --git a/ocean_dp/processing/panda_merger.py b/ocean_dp/processing/panda_merger.py
index 69de3f6..495b906 100755
--- a/ocean_dp/processing/panda_merger.py
+++ b/ocean_dp/processing/panda_merger.py
@@ -20,7 +20,7 @@
 from datetime import datetime as dt
 from datetime import timedelta 
 import numpy as np
-
+import glob
 import pandas as pd
 
 # import two netcdf
@@ -29,9 +29,9 @@
 nc2 = Dataset('IMOS_ABOS-SOTS_T_20180801_SOFS_FV00_SOFS-7.5-2018-Starmon-mini-4048-45m_END-20190331_C-20200401.nc',mode='r')
 
 # convert their time and temp data into dataframes
-df1 = pd.DataFrame({'TIME':np.array(nc1.variables['TIME'][:]),'TEMP_200':np.array(nc1.variables['TEMP'][:])})
+df1 = pd.DataFrame({'TIME':np.array(nc1.variables['TIME'][:]),'TEMP_'+str(nc1.variables['NOMINAL_DEPTH'][0]):np.array(nc1.variables['TEMP'][:])})
 
-df2 = pd.DataFrame({'TIME':np.array(nc2.variables['TIME'][:]),'TEMP_45':np.array(nc2.variables['TEMP'][:])})
+df2 = pd.DataFrame({'TIME':np.array(nc2.variables['TIME'][:]),'TEMP_'+str(nc2.variables['NOMINAL_DEPTH'][0]):np.array(nc2.variables['TEMP'][:])})
 
 # convert the times from days since 01-01-1950 to a datetime object
 df1['TIME']=pd.to_timedelta(df1['TIME'],unit='D')+dt(1950,1,1)
@@ -59,7 +59,37 @@
 total_df = pd.concat([df1,df2], join='outer', axis=1)
 
 
-
+files = glob.glob('*FV00*.nc')
+
+var_name = 'TEMP'
+
+def panda_combine(files,var_name):
+    
+    total_df = pd.DataFrame({'A' : []})
+    
+    # make a sorting index for columns from nominal depths
+    
+    for cur_file in files:
+        
+        cur_nc = Dataset(cur_file,mode='r')
+        
+        cur_df = pd.DataFrame({'TIME':np.array(cur_nc.variables['TIME'][:]),var_name+'_'+str(cur_nc.variables['NOMINAL_DEPTH'][0]):np.array(cur_nc.variables[var_name][:])})
+
+        cur_df['TIME']=pd.to_timedelta(cur_df['TIME'],unit='D')+dt(1950,1,1)
+            
+        cur_df = cur_df.set_index('TIME')
+        
+        cur_df = cur_df.resample('H',base=0.5).mean()
+        
+        cur_df.index = cur_df.index + pd.Timedelta('30 min')
+        
+        total_df = pd.concat([total_df,cur_df], join='outer', axis=1)
+        
+        print(cur_file)
+        
+        print(len(total_df))
+            
+    return total_df
 
 
 

From afeb0670d6a2b8c809fb2c01f54fd96d90b14971 Mon Sep 17 00:00:00 2001
From: bweeding <57697604+bweeding@users.noreply.github.com>
Date: Mon, 1 Jun 2020 15:28:35 +1000
Subject: [PATCH 56/59] beginning of netcdf to panda converter

---
 ocean_dp/processing/netcdf_to_df.py | 94 +++++++++++++++++++++++++++++
 ocean_dp/processing/panda_merger.py | 93 ++++++++++++++++++----------
 2 files changed, 155 insertions(+), 32 deletions(-)
 create mode 100755 ocean_dp/processing/netcdf_to_df.py

diff --git a/ocean_dp/processing/netcdf_to_df.py b/ocean_dp/processing/netcdf_to_df.py
new file mode 100755
index 0000000..dc1d6fa
--- /dev/null
+++ b/ocean_dp/processing/netcdf_to_df.py
@@ -0,0 +1,94 @@
+# Copyright (C) 2020 Ben Weeding
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+from netCDF4 import Dataset, num2date
+import pandas as pd
+import numpy as np
+from datetime import datetime as dt
+
+
+# =============================================================================
+# Returns a list of the time series variables in a IMOS format 
+# netcdf file. Takes an open netcdf as its argument.
+# =============================================================================
+def var_selector_inc_time(nc,qc=False,):
+    
+    x = [x for x in list(nc.variables) if ('_quality_control_' not in x) & (nc.variables[x].shape!=())]
+        
+    return x    
+
+# =============================================================================
+# 
+# =============================================================================
+def netcdf_to_df(target_file):
+    
+    # open the inputted netcdf
+    nc = Dataset(target_file,mode='r')
+    
+    # creates a the list of variables to transfer to the dataframe
+    vars_to_transfer = var_selector_inc_time(nc)
+    
+    # creates the dataframe with column labels
+    df = pd.DataFrame(columns = vars_to_transfer)
+    
+    # sorts the columns alphabetically, with the relevant qc variable following each timeseries variable
+    df.sort_index(axis=1, inplace=True)
+    
+    # fill the dataframe from the netcdf, variable by variable
+    for cur_var in vars_to_transfer:
+        
+        df[cur_var] = np.array(nc.variables[cur_var])
+        
+    # convert time into a datetime object, this is optional, and not needed to continue in the IMOS format
+    #df['TIME']=pd.to_timedelta(df['TIME'],unit='D')+dt(1950,1,1)
+    
+    # index the dataframe by time        
+    df = df.set_index('TIME')
+    
+    # extract the column names
+    col_names = list(df.columns)
+            
+    # append the nominal depth to all column names
+    df.columns = [x.replace('quality_control',str(nc.variables['NOMINAL_DEPTH'][0])+'_quality_control') if 'quality_control' in x else x + '_' + str(nc.variables['NOMINAL_DEPTH'][0]) for x in col_names]        
+           
+        
+        
+        
+        
+        
+        
+        
+        
+    
+    
+    
+    
+    
+    
+    
+    
+    
+    
+    
+    
+    
+    
+    
+    
+
+        
+    
+    
+
diff --git a/ocean_dp/processing/panda_merger.py b/ocean_dp/processing/panda_merger.py
index 495b906..13a789c 100755
--- a/ocean_dp/processing/panda_merger.py
+++ b/ocean_dp/processing/panda_merger.py
@@ -22,50 +22,34 @@
 import numpy as np
 import glob
 import pandas as pd
+import re
 
-# import two netcdf
-nc1 = Dataset('IMOS_ABOS-SOTS_COPST_20180801_SOFS_FV00_SOFS-7.5-2018-SBE37SMP-ODO-RS232-03715971-200m_END-20190324_C-20200401.nc',mode='r')
 
-nc2 = Dataset('IMOS_ABOS-SOTS_T_20180801_SOFS_FV00_SOFS-7.5-2018-Starmon-mini-4048-45m_END-20190331_C-20200401.nc',mode='r')
 
-# convert their time and temp data into dataframes
-df1 = pd.DataFrame({'TIME':np.array(nc1.variables['TIME'][:]),'TEMP_'+str(nc1.variables['NOMINAL_DEPTH'][0]):np.array(nc1.variables['TEMP'][:])})
 
-df2 = pd.DataFrame({'TIME':np.array(nc2.variables['TIME'][:]),'TEMP_'+str(nc2.variables['NOMINAL_DEPTH'][0]):np.array(nc2.variables['TEMP'][:])})
-
-# convert the times from days since 01-01-1950 to a datetime object
-df1['TIME']=pd.to_timedelta(df1['TIME'],unit='D')+dt(1950,1,1)
-
-df2['TIME']=pd.to_timedelta(df2['TIME'],unit='D')+dt(1950,1,1)
-
-# index the dataframes by time
-df1=df1.set_index('TIME')
-
-df2=df2.set_index('TIME')
-
-
-# resample the data, calculating the mean over hourly periods, starting on the half hour
-df1=df1.resample('H',base=0.5).mean()
-
-df2=df2.resample('H',base=0.5).mean()
-
-# reset the labels so they read the hour in the centre of the averaging period
-df1.index = df1.index + pd.Timedelta('30 min')
-
-df2.index = df2.index + pd.Timedelta('30 min')
+files = glob.glob('*FV00*.nc')
 
+var_name = 'TEMP'
 
-# combine the two dataframes based on their time indicies, recording nan if one sensor doesn't have a reading for that timestamp
-total_df = pd.concat([df1,df2], join='outer', axis=1)
+def depth_from_file(file_in):
+    
+    result = int(re.findall(r'(?<=-)\w+(?=m_END)', file_in)[0])
+    
+    return result
 
+def var_selector(file):
+    
+    [x for x in var_list if (x!='TIME') & ('_quality_control' not in x) & (nc.variables[x].shape!=())]
+        
+    return x    
 
-files = glob.glob('*FV00*.nc')
 
-var_name = 'TEMP'
 
 def panda_combine(files,var_name):
     
-    total_df = pd.DataFrame({'A' : []})
+    files.sort(key=depth_from_file)
+    
+    total_df = pd.DataFrame({'dummy' : []})
     
     # make a sorting index for columns from nominal depths
     
@@ -88,12 +72,57 @@ def panda_combine(files,var_name):
         print(cur_file)
         
         print(len(total_df))
+        
+    total_df.drop(['dummy'],axis=1,inplace=True)
             
     return total_df
 
 
+result = re.findall(r'(?<=-)\w+(?=m_END)', cur_file)
+
+
+
+
+
+
+# =============================================================================
+# Old proof of concept code
+# =============================================================================
+
+# # import two netcdf
+# nc1 = Dataset('IMOS_ABOS-SOTS_COPST_20180801_SOFS_FV00_SOFS-7.5-2018-SBE37SMP-ODO-RS232-03715971-200m_END-20190324_C-20200401.nc',mode='r')
+
+# nc2 = Dataset('IMOS_ABOS-SOTS_T_20180801_SOFS_FV00_SOFS-7.5-2018-Starmon-mini-4048-45m_END-20190331_C-20200401.nc',mode='r')
+
+# # convert their time and temp data into dataframes
+# df1 = pd.DataFrame({'TIME':np.array(nc1.variables['TIME'][:]),'TEMP_'+str(nc1.variables['NOMINAL_DEPTH'][0]):np.array(nc1.variables['TEMP'][:])})
+
+# df2 = pd.DataFrame({'TIME':np.array(nc2.variables['TIME'][:]),'TEMP_'+str(nc2.variables['NOMINAL_DEPTH'][0]):np.array(nc2.variables['TEMP'][:])})
+
+# # convert the times from days since 01-01-1950 to a datetime object
+# df1['TIME']=pd.to_timedelta(df1['TIME'],unit='D')+dt(1950,1,1)
+
+# df2['TIME']=pd.to_timedelta(df2['TIME'],unit='D')+dt(1950,1,1)
+
+# # index the dataframes by time
+# df1=df1.set_index('TIME')
+
+# df2=df2.set_index('TIME')
+
+
+# # resample the data, calculating the mean over hourly periods, starting on the half hour
+# df1=df1.resample('H',base=0.5).mean()
+
+# df2=df2.resample('H',base=0.5).mean()
+
+# # reset the labels so they read the hour in the centre of the averaging period
+# df1.index = df1.index + pd.Timedelta('30 min')
+
+# df2.index = df2.index + pd.Timedelta('30 min')
 
 
+# # combine the two dataframes based on their time indicies, recording nan if one sensor doesn't have a reading for that timestamp
+# total_df = pd.concat([df1,df2], join='outer', axis=1)
 
 
 

From b43195f3ff55166fec8847ce5980cfeb035011a8 Mon Sep 17 00:00:00 2001
From: bweeding <57697604+bweeding@users.noreply.github.com>
Date: Tue, 2 Jun 2020 16:13:46 +1000
Subject: [PATCH 57/59] netcdf_to_df progress

---
 ocean_dp/processing/netcdf_to_df.py |  87 +++++++++++++++++--
 ocean_dp/qc/in_out_water.py         | 127 +++++++++++++++++-----------
 2 files changed, 158 insertions(+), 56 deletions(-)

diff --git a/ocean_dp/processing/netcdf_to_df.py b/ocean_dp/processing/netcdf_to_df.py
index dc1d6fa..e86b158 100755
--- a/ocean_dp/processing/netcdf_to_df.py
+++ b/ocean_dp/processing/netcdf_to_df.py
@@ -49,22 +49,93 @@ def netcdf_to_df(target_file):
     # fill the dataframe from the netcdf, variable by variable
     for cur_var in vars_to_transfer:
         
-        df[cur_var] = np.array(nc.variables[cur_var])
-        
-    # convert time into a datetime object, this is optional, and not needed to continue in the IMOS format
-    #df['TIME']=pd.to_timedelta(df['TIME'],unit='D')+dt(1950,1,1)
+        df[cur_var] = np.array(nc.variables[cur_var])    
     
-    # index the dataframe by time        
-    df = df.set_index('TIME')
+    # store deployment times in attributes
+    df.attrs['time_deployment_start'] = nc.time_deployment_start
+    df.attrs['time_deployment_end'] = nc.time_deployment_end
     
     # extract the column names
     col_names = list(df.columns)
             
     # append the nominal depth to all column names
-    df.columns = [x.replace('quality_control',str(nc.variables['NOMINAL_DEPTH'][0])+'_quality_control') if 'quality_control' in x else x + '_' + str(nc.variables['NOMINAL_DEPTH'][0]) for x in col_names]        
-           
+    df.columns = [x.replace('quality_control',str(nc.variables['NOMINAL_DEPTH'][0])+'_quality_control') if 'quality_control' in x else x if 'TIME' in x else x + '_' + str(nc.variables['NOMINAL_DEPTH'][0]) for x in col_names]        
+    
+    nc.close()
+       
+    return df
+    
+    
+# =============================================================================
+# Takes a list of dataframes (e.g. from netcdf_to_df) and resamples each one onto an hourly time base
+# =============================================================================
+        
+def combine_df(target_dfs):
+    
+    # for each of the dataframes in the list provided
+    for cur_df in target_dfs:
+        
+        # make a copy of the current dataframe to modify and combine
+        df = cur_df.copy()
+        
+        # convert the IMOS format times to datetime 
+        df['TIME']=pd.to_timedelta(df['TIME'],unit='D')+dt(1950,1,1)
+        
+        # index the dataframe by time - for some reason this makes the df very slow to visually open and navigate!?  
+        df = df.set_index('TIME')
+        
+        # extract and convert deployment times to datetime 
+        start_time = dt.strptime(df.attrs['time_deployment_start'],'%Y-%m-%dT%H:%M:%SZ')
+        end_time = dt.strptime(df.attrs['time_deployment_end'],'%Y-%m-%dT%H:%M:%SZ')
+        
+        # trim the df to only include in water data
+        df = df.drop(df[(df.index < start_time) | (df.index > end_time)].index)
+        
+        # resamples using the max method, to create a df of the correct dimensions to fill
+        df_to_fill = df.resample('H',base=0.5).max()
+        
+        
+        # gets list of column names
+        col_names = list(df.columns)
         
+        # makes a list of non qc column names
+        col_names_no_qc = [x for x in col_names if 'quality_control' not in x]
         
+        # for each of the time series data columns
+        for cur_col in col_names_no_qc:
+            
+            # sets the value of non qc data to nan if the corresponding qc value is not satisfactory (0,1,2,7 at the moment)
+            df.loc[(df[cur_col+'_quality_control'] > 2) & (df[cur_col+'_quality_control'] != 7), cur_col] = np.nan
+            
+            # extracts the time series data
+            dS = pd.Series(df[cur_col])
+            
+            # makes a copy for bin counting
+            dS_1s = dS.copy()
+            
+            dS_1s[:] = 1
+            
+            # resamples the series, interpoling linearly
+            dS_resampled = dS.resample('H',base=0.5).interpolate()
+            
+            # count how many data points are in each shoulder bin
+            dS_bin_counts = dS_1s.resample('H',base=0.5).sum()
+            
+            # fill the interpolated data back into the dataframe
+            df_to_fill[cur_col] = dS_resampled
+            
+            # give any interpolated point without any data within its hour window a qc code of 7
+            df_to_fill.loc[dS_bin_counts==0,[cur_col+'_quality_control']] = 7
+            
+        # shift the timestamps to the middle of the hour sampling period
+        df_to_fill.index = df_to_fill.index + pd.Timedelta('30 min')
+        
+
+        
+        
+        
+
+            
         
         
         
diff --git a/ocean_dp/qc/in_out_water.py b/ocean_dp/qc/in_out_water.py
index e8635ed..39da3a7 100644
--- a/ocean_dp/qc/in_out_water.py
+++ b/ocean_dp/qc/in_out_water.py
@@ -18,73 +18,104 @@
 
 from netCDF4 import Dataset, num2date
 import sys
-
+from datetime import datetime
 import numpy as np
 from dateutil import parser
 import pytz
 import os
 
-# flag out of water as QC value 7 (not_deployed), with wise leave as 0
+# flag out of water as QC value 6 (not_deployed), with wise leave as 0
 
 
 def in_out_water(netCDFfile, var_name=None):
-    ds = Dataset(netCDFfile, 'a')
-
-    nc_vars = ds.variables
-    to_add = []
-    if var_name:
-        to_add.append(var_name)
-    else:
-        for v in nc_vars:
-            #print (vars[v].dimensions)
-            if v != 'TIME':
-                to_add.append(v)
-
-    time_var = nc_vars["TIME"]
-    time = num2date(time_var[:], units=time_var.units, calendar=time_var.calendar)
-
-    time_deploy = parser.parse(ds.time_deployment_start, ignoretz=True)
-    time_recovery = parser.parse(ds.time_deployment_end, ignoretz=True)
 
-    print('deployment time', time_deploy)
+    out_file = []
+
+    for fn in netCDFfile:
+        ds = Dataset(fn, 'a')
+
+        nc_vars = ds.variables
+        to_add = []
+        if var_name:
+            to_add.append(var_name)
+        else:
+            for v in nc_vars:
+                if "TIME" in nc_vars[v].dimensions:
+                    #print (vars[v].dimensions)
+                    if v != 'TIME':
+                        to_add.append(v)
+            # remove any anx variables from the list
+            for v in nc_vars:
+                if 'ancillary_variables' in nc_vars[v].ncattrs():
+                    remove = nc_vars[v].getncattr('ancillary_variables').split(' ')
+                    print("remove ", remove)
+                    for r in remove:
+                        to_add.remove(r)
+
+        time_var = nc_vars["TIME"]
+        time = num2date(time_var[:], units=time_var.units, calendar=time_var.calendar)
+
+        time_deploy = parser.parse(ds.time_deployment_start, ignoretz=True)
+        time_recovery = parser.parse(ds.time_deployment_end, ignoretz=True)
+
+        print('file', fn)
+        print('deployment time', time_deploy)
+
+        print('var to add', to_add)
+
+        # create a mask for the time range
+        mask = (time <= time_deploy) | (time >= time_recovery)
+
+        for v in to_add:
+            print("var", v, ' dimensions ', nc_vars[v].dimensions)
+
+            ncVarOut = nc_vars[v + "_quality_control"]
+            ncVarOut[mask] = 6
+
+            # create a qc variable just for this test flags
+            if v + "_quality_control_io" in ds.variables:
+                ncVarOut = ds.variables[v + "_quality_control_io"]
+                ncVarOut[:] = 0
+            else:
+                ncVarOut = ds.createVariable(v + "_quality_control_io", "i1", nc_vars[v].dimensions, fill_value=99, zlib=True)  # fill_value=0 otherwise defaults to max
+                nc_vars[v].ancillary_variables = nc_vars[v].ancillary_variables + " " + v + "_quality_control_io"
 
-    print(to_add)
+            ncVarOut[:] = 0
+            ncVarOut.long_name = "quality flag for " + nc_vars[v].long_name
+            try:
+                ncVarOut.standard_name = nc_vars[v].standard_name + " status_flag"
+            except AttributeError:
+                pass
 
-    # create a mask for the time range
-    mask = (time <= time_deploy) | (time >= time_recovery)
+            ncVarOut.quality_control_conventions = "IMOS standard flags"
+            ncVarOut.flag_values = np.array([0, 1, 2, 3, 4, 6, 7, 9], dtype=np.int8)
+            ncVarOut.flag_meanings = 'unknown good_data probably_good_data probably_bad_data bad_data not_deployed interpolated missing_value'
+            ncVarOut.comment = 'data flagged not deployed (6) when out of water'
 
-    for v in to_add:
-        if "TIME" in nc_vars[v].dimensions:
-            if v.endswith("_quality_control"):
-                print("QC time dim ", v)
+            ncVarOut[mask] = 6
+            # calculate the number of points marked as bad_data
+            marked = np.zeros_like(ncVarOut)
+            marked[mask] = 1
+            count = sum(marked)
 
-                ncVarOut = nc_vars[v]
-                ncVarOut[mask] = 7
-            else:
-                # create a qc variable just for this test flags
-                if v + "_quality_control_io" in ds.variables:
-                    ncVarOut = ds.variables[v + "_quality_control_io"]
-                else:
-                    ncVarOut = ds.createVariable(v + "_quality_control_io", "i1", nc_vars[v].dimensions, fill_value=99, zlib=True)  # fill_value=0 otherwise defaults to max
-                ncVarOut[:] = np.zeros(nc_vars[v].shape)
-                ncVarOut.long_name = "quality flag for " + v
-                ncVarOut.flag_values = np.array([0, 1, 2, 3, 4, 6, 7, 9], dtype=np.int8)
-                ncVarOut.flag_meanings = 'unknown good_data probably_good_data probably_bad_data bad_data not_deployed interpolated missing_value'
+        ds.file_version = "Level 1 - Quality Controlled Data"
+        # update the history attribute
+        try:
+            hist = ds.history + "\n"
+        except AttributeError:
+            hist = ""
 
-                nc_vars[v].ancillary_variables = nc_vars[v].ancillary_variables + " " + v + "_quality_control_io"
-                ncVarOut[mask] = 7
-        
-        ds.variables[v + "_quality_control"][:] = np.maximum(ds.variables[v + "_quality_control_io"][:],ds.variables[v + "_quality_control"][:])
+        ds.setncattr('history', hist + datetime.utcnow().strftime("%Y-%m-%d") + ' :  ' + ' marked ' + str(int(count)))
 
-    ds.file_version = "Level 1 - Quality Controlled Data"
+        ds.close()
 
-    ds.close()
+        out_file.append(fn)
 
-    return netCDFfile
+    return out_file
 
 
 if __name__ == "__main__":
     if len(sys.argv) > 2 & sys.argv[1].startswith('-'):
-        in_out_water(sys.argv[2], var_name=sys.argv[1][1:])
+        in_out_water(sys.argv[2:], var_name=sys.argv[1][1:])
     else:
-        in_out_water(sys.argv[1])
\ No newline at end of file
+        in_out_water(sys.argv[1:])
\ No newline at end of file

From fe0ab111fbe2b157127cfada96a2704db54dc9ff Mon Sep 17 00:00:00 2001
From: bweeding <57697604+bweeding@users.noreply.github.com>
Date: Thu, 11 Jun 2020 16:22:27 +1000
Subject: [PATCH 58/59] Various updates after convo with TT, not complete

---
 ocean_dp/processing/netcdf_to_df.py    |  54 +++++++++-
 ocean_dp/qc/in_out_water.py            | 144 ++++++++++++-------------
 ocean_dp/qc/qc_checker.py              |   7 ++
 ocean_dp/sots_processing_runthrough.py |   5 +
 4 files changed, 134 insertions(+), 76 deletions(-)

diff --git a/ocean_dp/processing/netcdf_to_df.py b/ocean_dp/processing/netcdf_to_df.py
index e86b158..4a409a4 100755
--- a/ocean_dp/processing/netcdf_to_df.py
+++ b/ocean_dp/processing/netcdf_to_df.py
@@ -17,7 +17,8 @@
 import pandas as pd
 import numpy as np
 from datetime import datetime as dt
-
+import glob
+import re
 
 # =============================================================================
 # Returns a list of the time series variables in a IMOS format 
@@ -54,6 +55,7 @@ def netcdf_to_df(target_file):
     # store deployment times in attributes
     df.attrs['time_deployment_start'] = nc.time_deployment_start
     df.attrs['time_deployment_end'] = nc.time_deployment_end
+    df.attrs['nominal_depth'] = nc.variables['NOMINAL_DEPTH'][0]
     
     # extract the column names
     col_names = list(df.columns)
@@ -72,6 +74,8 @@ def netcdf_to_df(target_file):
         
 def combine_df(target_dfs):
     
+    total_df = pd.DataFrame({'dummy' : []})
+    
     # for each of the dataframes in the list provided
     for cur_df in target_dfs:
         
@@ -91,8 +95,11 @@ def combine_df(target_dfs):
         # trim the df to only include in water data
         df = df.drop(df[(df.index < start_time) | (df.index > end_time)].index)
         
+        # remove data with bad qc instead of setting to nan later in process, let resample do the work?
+        # but what if psal is bad but temp is good?? Need to think on this.
+        
-        # resamples using the max method, to create a df of the correct dimensions to fill
+        # resamples using the min method, to create a df of the correct dimensions to fill
-        df_to_fill = df.resample('H',base=0.5).max()
+        df_to_fill = df.resample('H',base=0.5).min()
         
         
         # gets list of column names
@@ -105,7 +112,9 @@ def combine_df(target_dfs):
         for cur_col in col_names_no_qc:
             
             # sets the value of non qc data to nan if the corresponding qc value is not satisfactory (0,1,2,7 at the moment)
-            df.loc[(df[cur_col+'_quality_control'] > 2) & (df[cur_col+'_quality_control'] != 7), cur_col] = np.nan
+            #df.loc[(df[cur_col+'_quality_control'] > 2) & (df[cur_col+'_quality_control'] != 7), cur_col] = np.nan
+            # CAUSING UNEXPECTED NANS - FIX
+            df.loc[df[cur_col+'_quality_control'].isin([3,4,6,9]) , cur_col] = np.nan
             
             # extracts the time series data
             dS = pd.Series(df[cur_col])
@@ -116,7 +125,8 @@ def combine_df(target_dfs):
             dS_1s[:] = 1
             
-            # resamples the series, interpoling linearly
+            # resamples the series, interpolating linearly
-            dS_resampled = dS.resample('H',base=0.5).interpolate()
+            dS_resampled = dS.resample('H',base=0.5).interpolate(method='index',axis=0,limit=1000000)
+
             
             # count how many data points are in each shoulder bin
             dS_bin_counts = dS_1s.resample('H',base=0.5).sum()
@@ -130,8 +140,44 @@ def combine_df(target_dfs):
         # shift the timestamps to the middle of the hour sampling period
         df_to_fill.index = df_to_fill.index + pd.Timedelta('30 min')
         
+        total_df = pd.concat([total_df,df_to_fill], join='outer', axis=1)
+        
+        print(cur_df)
+        
+        print(len(total_df))
+        
+    total_df.drop(['dummy'],axis=1,inplace=True)
+    
+    return total_df
+
 
+# =============================================================================
+# 
+# =============================================================================
         
+def depth_from_file(file_in):
+    
+    result = int(re.findall(r'(?<=-)\w+(?=m_END)', file_in)[0])
+    
+    return result
+
+# =============================================================================
+# 
+# =============================================================================
+
+netcdfs = glob.glob('*FV01*.nc')
+
+netcdfs = sorted(netcdfs,key=depth_from_file)
+
+df_list = list()
+
+for cur_netcdf in netcdfs:
+    
+    df = netcdf_to_df(cur_netcdf)
+    
+    df_list.append(df)
+    
+
         
         
 
diff --git a/ocean_dp/qc/in_out_water.py b/ocean_dp/qc/in_out_water.py
index 39da3a7..7ec3e4d 100644
--- a/ocean_dp/qc/in_out_water.py
+++ b/ocean_dp/qc/in_out_water.py
@@ -27,89 +27,89 @@
-# flag out of water as QC value 6 (not_deployed), with wise leave as 0
+# flag out of water as QC value 6 (not_deployed), otherwise leave as 0
 
 
-def in_out_water(netCDFfile, var_name=None):
+def in_out_water(fn, var_name=None):
 
     out_file = []
 
-    for fn in netCDFfile:
-        ds = Dataset(fn, 'a')
 
-        nc_vars = ds.variables
-        to_add = []
-        if var_name:
-            to_add.append(var_name)
-        else:
-            for v in nc_vars:
-                if "TIME" in nc_vars[v].dimensions:
-                    #print (vars[v].dimensions)
-                    if v != 'TIME':
-                        to_add.append(v)
-            # remove any anx variables from the list
-            for v in nc_vars:
-                if 'ancillary_variables' in nc_vars[v].ncattrs():
-                    remove = nc_vars[v].getncattr('ancillary_variables').split(' ')
-                    print("remove ", remove)
-                    for r in remove:
-                        to_add.remove(r)
-
-        time_var = nc_vars["TIME"]
-        time = num2date(time_var[:], units=time_var.units, calendar=time_var.calendar)
-
-        time_deploy = parser.parse(ds.time_deployment_start, ignoretz=True)
-        time_recovery = parser.parse(ds.time_deployment_end, ignoretz=True)
-
-        print('file', fn)
-        print('deployment time', time_deploy)
-
-        print('var to add', to_add)
-
-        # create a mask for the time range
-        mask = (time <= time_deploy) | (time >= time_recovery)
-
-        for v in to_add:
-            print("var", v, ' dimensions ', nc_vars[v].dimensions)
-
-            ncVarOut = nc_vars[v + "_quality_control"]
-            ncVarOut[mask] = 6
-
-            # create a qc variable just for this test flags
-            if v + "_quality_control_io" in ds.variables:
-                ncVarOut = ds.variables[v + "_quality_control_io"]
-                ncVarOut[:] = 0
-            else:
-                ncVarOut = ds.createVariable(v + "_quality_control_io", "i1", nc_vars[v].dimensions, fill_value=99, zlib=True)  # fill_value=0 otherwise defaults to max
-                nc_vars[v].ancillary_variables = nc_vars[v].ancillary_variables + " " + v + "_quality_control_io"
+    ds = Dataset(fn, 'a')
 
+    nc_vars = ds.variables
+    to_add = []
+    if var_name:
+        to_add.append(var_name)
+    else:
+        for v in nc_vars:
+            if "TIME" in nc_vars[v].dimensions:
+                #print (vars[v].dimensions)
+                if v != 'TIME':
+                    to_add.append(v)
+        # remove any anx variables from the list
+        for v in nc_vars:
+            if 'ancillary_variables' in nc_vars[v].ncattrs():
+                remove = nc_vars[v].getncattr('ancillary_variables').split(' ')
+                print("remove ", remove)
+                for r in remove:
+                    to_add.remove(r)
+
+    time_var = nc_vars["TIME"]
+    time = num2date(time_var[:], units=time_var.units, calendar=time_var.calendar)
+
+    time_deploy = parser.parse(ds.time_deployment_start, ignoretz=True)
+    time_recovery = parser.parse(ds.time_deployment_end, ignoretz=True)
+
+    print('file', fn)
+    print('deployment time', time_deploy)
+
+    print('var to add', to_add)
+
+    # create a mask for the time range
+    mask = (time <= time_deploy) | (time >= time_recovery)
+
+    for v in to_add:
+        print("var", v, ' dimensions ', nc_vars[v].dimensions)
+
+        ncVarOut = nc_vars[v + "_quality_control"]
+        ncVarOut[mask] = 6
+
+        # create a qc variable just for this test flags
+        if v + "_quality_control_io" in ds.variables:
+            ncVarOut = ds.variables[v + "_quality_control_io"]
             ncVarOut[:] = 0
-            ncVarOut.long_name = "quality flag for " + nc_vars[v].long_name
-            try:
-                ncVarOut.standard_name = nc_vars[v].standard_name + " status_flag"
-            except AttributeError:
-                pass
-
-            ncVarOut.quality_control_conventions = "IMOS standard flags"
-            ncVarOut.flag_values = np.array([0, 1, 2, 3, 4, 6, 7, 9], dtype=np.int8)
-            ncVarOut.flag_meanings = 'unknown good_data probably_good_data probably_bad_data bad_data not_deployed interpolated missing_value'
-            ncVarOut.comment = 'data flagged not deployed (6) when out of water'
-
-            ncVarOut[mask] = 6
-            # calculate the number of points marked as bad_data
-            marked = np.zeros_like(ncVarOut)
-            marked[mask] = 1
-            count = sum(marked)
-
-        ds.file_version = "Level 1 - Quality Controlled Data"
-        # update the history attribute
+        else:
+            ncVarOut = ds.createVariable(v + "_quality_control_io", "i1", nc_vars[v].dimensions, fill_value=99, zlib=True)  # fill_value=0 otherwise defaults to max
+            nc_vars[v].ancillary_variables = nc_vars[v].ancillary_variables + " " + v + "_quality_control_io"
+
+        ncVarOut[:] = 0
+        ncVarOut.long_name = "quality flag for " + nc_vars[v].long_name
         try:
-            hist = ds.history + "\n"
+            ncVarOut.standard_name = nc_vars[v].standard_name + " status_flag"
         except AttributeError:
-            hist = ""
+            pass
+
+        ncVarOut.quality_control_conventions = "IMOS standard flags"
+        ncVarOut.flag_values = np.array([0, 1, 2, 3, 4, 6, 7, 9], dtype=np.int8)
+        ncVarOut.flag_meanings = 'unknown good_data probably_good_data probably_bad_data bad_data not_deployed interpolated missing_value'
+        ncVarOut.comment = 'data flagged not deployed (6) when out of water'
+
+        ncVarOut[mask] = 6
+        # calculate the number of points marked as not_deployed
+        marked = np.zeros_like(ncVarOut)
+        marked[mask] = 1
+        count = sum(marked)
+
+    ds.file_version = "Level 1 - Quality Controlled Data"
+    # update the history attribute
+    try:
+        hist = ds.history + "\n"
+    except AttributeError:
+        hist = ""
 
-        ds.setncattr('history', hist + datetime.utcnow().strftime("%Y-%m-%d") + ' :  ' + ' marked ' + str(int(count)))
+    ds.setncattr('history', hist + datetime.utcnow().strftime("%Y-%m-%d") + ' :  ' + ' marked ' + str(int(count)))
 
-        ds.close()
+    ds.close()
 
-        out_file.append(fn)
+    out_file.append(fn)
 
     return out_file
 
diff --git a/ocean_dp/qc/qc_checker.py b/ocean_dp/qc/qc_checker.py
index 90e83e8..6fd7881 100755
--- a/ocean_dp/qc/qc_checker.py
+++ b/ocean_dp/qc/qc_checker.py
@@ -106,6 +106,13 @@ def qc_checker(nc,target_vars_in=[]):
                 
                 # The qc process has succeeded 
                 qc_behaving = True
+                
+                # sets all data with a qc value of 0 to have a qc value of 1, having passed all the tests
+                nc.variables[current_var+"_quality_control"][qc_global_data==0] = 1
+                
+                now=datetime.utcnow()
+                
+                nc.history += ' ' + now.strftime("%Y%m%d:") + 'passed qc_checker, all qc=0 set to qc=1'
     
     # Returns true if qc has succeeded, false if not
     return qc_behaving
diff --git a/ocean_dp/sots_processing_runthrough.py b/ocean_dp/sots_processing_runthrough.py
index c389d27..f9aee09 100755
--- a/ocean_dp/sots_processing_runthrough.py
+++ b/ocean_dp/sots_processing_runthrough.py
@@ -62,6 +62,11 @@
 # Run pressure_interpolator.py
 fv01_pres_interp_files = pressure_interpolator.pressure_interpolator(netCDFfiles=fv01_files,agg=glob.glob('*IMOS_ABOS-SOTS*Aggregate*.nc')[0])
 
+# delete the defunct FV01 files
+for ncfile in fv01_files:
+    
+    os.remove(ncfile)
+
 # Global range test
 for ncfile in fv01_pres_interp_files:
     

From 0ed9243197a720947016359c99d967f0a9b88d13 Mon Sep 17 00:00:00 2001
From: bweeding <57697604+bweeding@users.noreply.github.com>
Date: Thu, 25 Jun 2020 14:24:05 +1000
Subject: [PATCH 59/59] Create add_mld.py

---
 ocean_dp/processing/add_mld.py | 101 +++++++++++++++++++++++++++++++++
 1 file changed, 101 insertions(+)
 create mode 100755 ocean_dp/processing/add_mld.py

diff --git a/ocean_dp/processing/add_mld.py b/ocean_dp/processing/add_mld.py
new file mode 100755
index 0000000..112ce35
--- /dev/null
+++ b/ocean_dp/processing/add_mld.py
@@ -0,0 +1,101 @@
+# Copyright (C) 2020 Ben Weeding
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+from netCDF4 import Dataset, num2date
+import sys
+from datetime import datetime as dt
+import numpy as np
+import pandas as pd
+from scipy import interpolate
+
+def add_mld(nc_in,thresh_in=0.2,bottom_depth=4600):
+    """Add mixed layer depth (MLDx) and its uncertainty to an IMOS netCDF file.
+
+    For each TIME step the mean TEMP of the shallowest sensors (deeper than 5,
+    presumably m) is the reference; the MLD is the shallowest depth at which the
+    interpolated profile differs from it by thresh_in. The file is modified in place.
+
+    nc_in: path of a netCDF file holding TEMP (sensor, TIME) and DEPTH_TEMP.
+    thresh_in: temperature difference from the reference that defines the MLD.
+    bottom_depth: depth used for the uncertainty when no sensor reaches the
+        threshold (defaults to the previously hard-coded 4600).
+    """
+    # the context manager closes the file even if the calculation raises
+    with Dataset(nc_in,'a') as nc:
+        temp_na = np.array(nc.variables['TEMP'])
+        # nan filled arrays, one value per TIME step, for the mld and its uncertainty
+        nc_mld = np.full(temp_na.shape[1], np.nan)
+        nc_mld_uncert = np.full(temp_na.shape[1], np.nan)
+
+        # temp sensor depths, keeping only the sensors deeper than 5
+        nc_temp_depths = np.array(nc.variables['DEPTH_TEMP'])
+        temp_na = temp_na[nc_temp_depths>5,:]
+        nc_temp_depths = nc_temp_depths[nc_temp_depths>5]
+
+        # depth of the shallowest sensors, and a boolean selecting them
+        shallowest_depth = np.min(nc_temp_depths)
+        shallowest_sensors = nc_temp_depths == shallowest_depth
+
+        # for each profile with at least one non NaN value in the shallowest sensors
+        for i in np.where(~np.all(np.isnan(temp_na[shallowest_sensors]),axis=0))[0]:
+            deep_temps = temp_na[~shallowest_sensors,i]
+            deep_ok = ~np.isnan(deep_temps)
+
+            # skip profiles without a non NaN value in the deeper sensors
+            if not np.any(deep_ok):
+                continue
+
+            # mean of the available shallowest sensors, the reference for the MLD
+            shallow_temp = np.nanmean(temp_na[shallowest_sensors,i])
+            limits = [shallow_temp+thresh_in,shallow_temp-thresh_in]
+
+            # reference point followed by all non NaN data below it; the minimum
+            # depth is used so that DEPTH_TEMP does not have to be sorted
+            profile_temps = np.append(shallow_temp,deep_temps[deep_ok])
+            profile_depths = np.append(shallowest_depth,nc_temp_depths[~shallowest_sensors][deep_ok])
+
+            # check if the profile contains temperatures outside the threshold values
+            if np.any(profile_temps[1:]>=limits[0]) or np.any(profile_temps[1:]<=limits[1]):
+
+                # linear interpolator giving depth from temperature, nan outside the profile
+                profile_interp_func = interpolate.interp1d(profile_temps,profile_depths,bounds_error=False,fill_value=np.nan)
+
+                # shallowest depth at which the interpolated profile meets a limit
+                nc_mld[i] = np.nanmin(profile_interp_func(limits))
+
+                # uncertainty: distance to the furthest of the nearest sensors above
+                # and below the MLD; a side with no sensor counts as 0 rather than
+                # raising, and a nan MLD keeps a nan uncertainty
+                gaps = np.abs(profile_depths-nc_mld[i])
+                sides = [gaps[profile_depths<nc_mld[i]],gaps[profile_depths>nc_mld[i]]]
+                if not np.isnan(nc_mld[i]):
+                    nc_mld_uncert[i] = max(s.min() if s.size else 0.0 for s in sides)
+
+            # if none of the sensors are outside the threshold
+            else:
+
+                # mld is the deepest non NaN sensor, uncertainty its distance to the bottom
+                nc_mld[i] = np.max(profile_depths)
+                nc_mld_uncert[i] = bottom_depth - nc_mld[i]
+
+        # create the two variables
+        mld_var_out = nc.createVariable('MLDx', "f4", ("TIME",), fill_value=np.nan, zlib=True)
+        mld_var_out[:] = nc_mld
+        mld_var_out.units = 'm'
+        mld_var_out.comment = 'Calculated using the linear interpolation MLD algorithm found at: INSERT GITHUB ADDRESS'
+
+        mld_uncert_var_out = nc.createVariable('MLDx_standard_error', "f4", ("TIME",), fill_value=np.nan, zlib=True)
+        mld_uncert_var_out[:] = nc_mld_uncert
+        mld_uncert_var_out.units = 'm'
\ No newline at end of file