From aee7d2525c8ff9239361e1a193b1002dda1558de Mon Sep 17 00:00:00 2001 From: bweeding <57697604+bweeding@users.noreply.github.com> Date: Tue, 21 Jan 2020 14:59:23 +1100 Subject: [PATCH 01/59] Create addCO2.py --- ocean_dp/processing/addCO2.py | 90 +++++++++++++++++++++++++++++++++++ 1 file changed, 90 insertions(+) create mode 100755 ocean_dp/processing/addCO2.py diff --git a/ocean_dp/processing/addCO2.py b/ocean_dp/processing/addCO2.py new file mode 100755 index 0000000..6f095be --- /dev/null +++ b/ocean_dp/processing/addCO2.py @@ -0,0 +1,90 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Tue Jan 21 14:58:20 2020 + +@author: tru050 +""" + + + + +from netCDF4 import Dataset +import sys +import gsw +import numpy as np +from datetime import datetime +import pandas + +# addCO2 takes a SOTS FV02 gridded product netCDFfile as an input, and adds +# CO2 data (delivered from NOAA in a csv file) to the netCDFfile +def addCO2(netCDFfile): + + # Import the SOTS netcdf + ds = Dataset(netCDFfile, 'a') + + # Extract the time variable, in serial date numbers (days since 01/01/1950) + var_time = ds.variables["TIME"] + + # Convert the variable object to an array + netcdf_serials = np.array(var_time[:]) + + # Read in the CO2 csv file, ignoring the first five rows + dcsv = pandas.read_csv('SOFS_prelimdata_Nov2019.csv',header=5) + + # Convert the dataframe to an array + dc = dcsv.to_numpy() + + csv_dates = [] + + # Create a list of datetimes from the csv + for i in range(len(dc)): + + csv_dates.append(datetime.strptime(dc[i,0],'%m/%d/%Y %H:%M')) + + # Calculate the difference between the csv dates and 01/01/1950 in order + # to convert them to the serial date format of the netcdf + time_offset_1950 = datetime(1950,1,1,0,0,0) + + csv_delta= [] + + for i in range(len(dc)): + + csv_delta.append(csv_dates[i] - time_offset_1950) + + + # Convert the datetimes from the csv into an array of serial date numbers + csv_serials = [] + + for i in range(len(dc)): + + csv_serials.append(csv_delta[i].days + csv_delta[i].seconds/86400) + + csv_serials = np.array(csv_serials) + + # Find the indices of timestamps of the csv file that are in the deployment + # period of the netcdf file + matching_index = (netcdf_serials[0] <= csv_serials) & (csv_serials <= netcdf_serials[-1]) + + new_vars = ['XCO2_PRES','XCO2_OCEAN','XCO2_AIR','XCO2_PSAL','XCO2_SSTEMP'] + + # For each of the variables in the csv file (except time), linearly + # interpolate to the timestamps of the netcdf file + for i in range(0,len(new_vars)): + + np.interp(netcdf_serials,csv_serials[matching_index],np.array(dcsv[dcsv.columns[i+1]])[matching_index].astype('float64')) + + ncVarOut = ds.createVariable(new_vars[i], "f4", ("TIME",), fill_value=np.nan, zlib=True) + + + + # update the history attribute + try: + hist = ds.history + "\n" + except AttributeError: + hist = "" + + ds.setncattr('history', hist + datetime.utcnow().strftime("%Y-%m-%d") + " : added ") + + ds.close() + \ No newline at end of file From fea30ed80bd728b79bfe25c470515ac63db5dc11 Mon Sep 17 00:00:00 2001 From: bweeding <57697604+bweeding@users.noreply.github.com> Date: Tue, 21 Jan 2020 16:17:41 +1100 Subject: [PATCH 02/59] Update addCO2.py --- ocean_dp/processing/addCO2.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/ocean_dp/processing/addCO2.py b/ocean_dp/processing/addCO2.py index 6f095be..8a14af8 100755 --- a/ocean_dp/processing/addCO2.py +++ b/ocean_dp/processing/addCO2.py @@ -30,7 +30,7 @@ def addCO2(netCDFfile): netcdf_serials = np.array(var_time[:]) # Read in the CO2 csv file, ignoring the first five rows - dcsv = pandas.read_csv('SOFS_prelimdata_Nov2019.csv',header=5) + dcsv = pandas.read_csv('SOFS_prelimdata_Nov2019test.csv',header=5) # Convert the dataframe to an array dc = dcsv.to_numpy() @@ -68,15 +68,19 @@ def addCO2(netCDFfile): new_vars = ['XCO2_PRES','XCO2_OCEAN','XCO2_AIR','XCO2_PSAL','XCO2_SSTEMP'] + new_units = ['kPa','umol/mol','umol/mol','Presumed PSU - not specified','deg C'] + # For each of the variables in the csv file (except time), linearly # interpolate to the timestamps of the netcdf file for i in range(0,len(new_vars)): - - np.interp(netcdf_serials,csv_serials[matching_index],np.array(dcsv[dcsv.columns[i+1]])[matching_index].astype('float64')) ncVarOut = ds.createVariable(new_vars[i], "f4", ("TIME",), fill_value=np.nan, zlib=True) + ncVarOut[:] = np.interp(netcdf_serials,csv_serials[matching_index],np.array(dcsv[dcsv.columns[i+1]])[matching_index].astype('float64')) + + ncVarOut.units = new_units[i] + ncVarOut.comment = "imported from 'SOFS_prelimdata_Nov2019.csv'" # update the history attribute try: @@ -84,7 +88,7 @@ def addCO2(netCDFfile): except AttributeError: hist = "" - ds.setncattr('history', hist + datetime.utcnow().strftime("%Y-%m-%d") + " : added ") + ds.setncattr('history', hist + datetime.utcnow().strftime("%Y-%m-%d") + " : added 'XCO2_PRES','XCO2_OCEAN','XCO2_AIR','XCO2_PSAL','XCO2_SSTEMP' from 'SOFS_prelimdata_Nov2019.csv'") ds.close() \ No newline at end of file From 71470a317c942331e61977bad5f2e30b93759b73 Mon Sep 17 00:00:00 2001 From: bweeding <57697604+bweeding@users.noreply.github.com> Date: Wed, 22 Jan 2020 09:55:49 +1100 Subject: [PATCH 03/59] Update copyDataset.py --- ocean_dp/aggregation/copyDataset.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ocean_dp/aggregation/copyDataset.py b/ocean_dp/aggregation/copyDataset.py index 0079d13..154757f 100644 --- a/ocean_dp/aggregation/copyDataset.py +++ b/ocean_dp/aggregation/copyDataset.py @@ -244,8 +244,8 @@ def aggregate(files, varNames): filen = 0 - # variables we want regardless - varNames += ['LATITUDE', 'LONGITUDE', 'NOMINAL_DEPTH'] + # variables we want regardless + varNames = [varNames]+['LATITUDE', 'LONGITUDE', 'NOMINAL_DEPTH'] # remove any duplicates varNamesOut = set(varNames) From 7145c3ebd80702ece428d2d6d07afc53c166371b Mon Sep 17 00:00:00 2001 From: bweeding <57697604+bweeding@users.noreply.github.com> Date: Wed, 22 Jan 2020 15:08:31 +1100 Subject: [PATCH 04/59] Create add_interp_press.m --- matlab/add_interp_press.m | 125 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 125 insertions(+) create mode 100644 matlab/add_interp_press.m diff --git a/matlab/add_interp_press.m b/matlab/add_interp_press.m new file mode 100644 index 0000000..6b139ce --- /dev/null +++ b/matlab/add_interp_press.m @@ -0,0 +1,125 @@ +% SOTS Pressure interpolator + +% This code imports pressure data from an aggregated file (constructed by +% P.Jansen), and creates interpolated pressure records for FV00 raw +% instrument files - firstly by interpolating along time series of pressure +% readings in the aggregate file to find pressures at each time in a +% particular FV00 file, and secondly by interpolating down nominal depths at +% each timestamp to find a pressure value for each FV00 timestamp. + +% Ben Weeding - ben.weeding.26@gmail.com + +%% Load the filenames + +fv00_files = dir('*FV00*.nc'); +agg_files = dir('*Aggregate*.nc'); + +%% Load the pressure data + +agg_pres = ncread(agg_files.name,'PRES'); +agg_pres_info = ncinfo(agg_files.name, 'PRES'); +agg_instrument_index = ncread(agg_files.name,'instrument_index'); +agg_nominal_depth = ncread(agg_files.name,'NOMINAL_DEPTH'); +agg_time = ncread(agg_files.name,'TIME'); + +%% Create a scattered interpolant from the aggregate data + +% This was an error, as it interpolated in 2D space rather than twice in 1D + + +% Subsampled every 10 points for speed of execution at this point +%scat_interp_pres = scatteredInterpolant(agg_time(1:10:end),agg_nominal_depth(agg_instrument_index(1:10:end)+1),agg_pres(1:10:end)); + +%% Interpolate the pressure and write the data into the FV00 file + +% Loop through each of the fv00 files +for i=1:length(fv00_files) + + disp(fv00_files(i).name) + + % Extract the content from the FV00 file + fv00_contents = ncinfo(fv00_files(i).name); + + % Check if the FV00 file contains pressure data, run the interpolation + % code if not + + if (sum(contains({fv00_contents.Variables(:).Name}, 'PRES')) == 0) + + % Load the FV00 data requiring pressure + %'days since 1950-01-01 00:00:00 UTC' for minilog T + fv00_time = ncread(fv00_files(i).name,'TIME'); + fv00_depth = ncread(fv00_files(i).name,'NOMINAL_DEPTH'); + + % Interpolate the agg pressure records at each nominal depth to + % provide pressure values at each timestamp in the current FV00 + % file + + interp_agg_pres = nan(length(agg_nominal_depth),length(fv00_time)); + + % Loop through each nominal depth in the aggregate file, and get pressure for the FV00 file's time + for j = 1:length(agg_nominal_depth) + + % Select the relevant time and pressures + + time_selection = agg_time(agg_instrument_index == (j-1)); + pres_selection = agg_pres(agg_instrument_index == (j-1)); + + % Interpolate along each nominal depth + + interp_agg_pres(j,:) = interp1(time_selection,pres_selection,fv00_time); + end + + % At each timestamp in the FV00 record, interpolate a pressure + % value based on the FV00 nominal depth, and the interpolated + % pressures in interp_agg_pres. Using spline interpolation here to + % deal with NaN values that occur in failed pressure sensors. + pres_interp_dummy = nan(size(fv00_time)); + + for l = 1:length(fv00_time) + + pres_interp_dummy(l) = interp1(agg_nominal_depth,interp_agg_pres(:,l),fv00_depth,'spline'); + + end + + pres_interp = pres_interp_dummy; + + % Create an FV01 version of the current FV00 file + + % Create the new FV01 file name + + fv01_name = strrep(fv00_files(i).name,'FV00','FV01'); + fv01_name(end-10:end-3)=datestr(now,'yyyymmdd'); + + % Write the FV00 data into the FV01 file + ncwriteschema(fv01_name, fv00_contents); + + % copy variable data to new file + for v = fv00_contents.Variables + ncwrite(fv01_name, v.Name, ncread(fv00_files(i).name, v.Name)); + end + + % Modify the global attributes of the file to record processing, + % and add to the file history + + ncwriteatt(fv01_name,'/','file_version','Level 1 - partially processed'); + hist = ncreadatt(fv00_files(i).name, '/', 'history'); + ncwriteatt(fv01_name,'/','history',[hist newline datestr(now,'yyyy-mm-dd') ' : Added interpolated pressure from ' agg_files.name]); + + % Add and populate a PRES variable to the FV01 file + nccreate(fv01_name, 'PRES', 'Dimensions',{'TIME',size(pres_interp,1)}, 'FillValue',NaN); + ncwrite(fv01_name, 'PRES', pres_interp); + + % copy attributes from agg file to output file + pres_atts = agg_pres_info.Attributes; % get all attribtes from the aggregate file + for k=1:length(pres_atts) + if (strcmp(pres_atts(k).Name, '_FillValue') == 0) + ncwriteatt(fv01_name, 'PRES', pres_atts(k).Name, pres_atts(k).Value); + end + end + + % Add the relevant attributes to the PRES variable, including a + % comment noting that the data has been linearly interpolated + ncwriteatt(fv01_name, 'PRES', 'comment','pressure data has been linearly interpolated from surrounding pressure sensors'); + + end +end \ No newline at end of file From 8d81ba9c925d5f6fda53d3e14bb1161cefd6ba21 Mon Sep 17 00:00:00 2001 From: bweeding <57697604+bweeding@users.noreply.github.com> Date: Thu, 30 Jan 2020 12:28:58 +1100 Subject: [PATCH 05/59] Update add_interp_press.m --- matlab/add_interp_press.m | 158 ++++++++++++++++++++++++++++++++++---- 1 file changed, 142 insertions(+), 16 deletions(-) diff --git a/matlab/add_interp_press.m b/matlab/add_interp_press.m index 6b139ce..5e5ff5a 100644 --- a/matlab/add_interp_press.m +++ b/matlab/add_interp_press.m @@ -22,13 +22,16 @@ agg_nominal_depth = ncread(agg_files.name,'NOMINAL_DEPTH'); agg_time = ncread(agg_files.name,'TIME'); -%% Create a scattered interpolant from the aggregate data - -% This was an error, as it interpolated in 2D space rather than twice in 1D +% Here we prevent the use of bad data from Pulse 8 +if strfind(fv00_files(1).name,'Pulse-8') + + agg_pres(agg_instrument_index==2 & agg_time+datenum(1950,1,1,0,0,0) >= datenum('30-01-2012 05:00','dd-mm-yyyy HH:MM'))=NaN; + + %agg_pres(agg_instrument_index==2)=NaN; + +end -% Subsampled every 10 points for speed of execution at this point -%scat_interp_pres = scatteredInterpolant(agg_time(1:10:end),agg_nominal_depth(agg_instrument_index(1:10:end)+1),agg_pres(1:10:end)); %% Interpolate the pressure and write the data into the FV00 file @@ -54,10 +57,16 @@ % provide pressure values at each timestamp in the current FV00 % file - interp_agg_pres = nan(length(agg_nominal_depth),length(fv00_time)); + interp_agg_pres = nan(length(agg_nominal_depth)+1,length(fv00_time)); + + % Include a row of zeros to set surface depth as 0 dbar + + interp_agg_pres(1,:) = zeros(size(fv00_time)); + + agg_nominal_depth_with_0 = [0; agg_nominal_depth]; % Loop through each nominal depth in the aggregate file, and get pressure for the FV00 file's time - for j = 1:length(agg_nominal_depth) + for j = 1:(length(agg_nominal_depth)) % Select the relevant time and pressures @@ -66,20 +75,36 @@ % Interpolate along each nominal depth - interp_agg_pres(j,:) = interp1(time_selection,pres_selection,fv00_time); - end + interp_agg_pres(j+1,:) = interp1(time_selection,pres_selection,fv00_time); + end + + % Sort the nominal depths and pressures + + [agg_nominal_depth_with_0,sort_idx] = sort(agg_nominal_depth_with_0); + + interp_agg_pres = interp_agg_pres(sort_idx,:); + + + % Linearly interpolate at each timestamp to replace NaN values + + interp_agg_pres = fillmissing(interp_agg_pres,'linear','SamplePoints',agg_nominal_depth_with_0); + % At each timestamp in the FV00 record, interpolate a pressure % value based on the FV00 nominal depth, and the interpolated - % pressures in interp_agg_pres. Using spline interpolation here to - % deal with NaN values that occur in failed pressure sensors. - pres_interp_dummy = nan(size(fv00_time)); + % pressures in interp_agg_pres. + pres_interp_dummy = nan(size(fv00_time)); - for l = 1:length(fv00_time) + + for l = 1:length(fv00_time) + + if sum(~isnan(interp_agg_pres(:,l))) > 1 - pres_interp_dummy(l) = interp1(agg_nominal_depth,interp_agg_pres(:,l),fv00_depth,'spline'); + pres_interp_dummy(l) = interp1(agg_nominal_depth_with_0,interp_agg_pres(:,l),fv00_depth); - end + end + + end pres_interp = pres_interp_dummy; @@ -119,7 +144,108 @@ % Add the relevant attributes to the PRES variable, including a % comment noting that the data has been linearly interpolated - ncwriteatt(fv01_name, 'PRES', 'comment','pressure data has been linearly interpolated from surrounding pressure sensors'); + ncwriteatt(fv01_name, 'PRES', 'comment','pressure data has been interpolated from surrounding pressure sensors'); + + else + + % Load the FV00 data containing pressure + %'days since 1950-01-01 00:00:00 UTC' for minilog T + fv00_time = ncread(fv00_files(i).name,'TIME'); + fv00_depth = ncread(fv00_files(i).name,'NOMINAL_DEPTH'); + fv00_pres = ncread(fv00_files(i).name,'PRES'); + + % Remove bad data in pulse 8 + + if strfind(fv00_files(i).name,'Pulse-8-2011-SBE16plusV2-01606330-34m') + + fv00_pres(4442:end) = NaN; + + end + + % Interpolate the agg pressure records at each nominal depth to + % provide pressure values at each timestamp in the current FV00 + % file + + interp_agg_pres = nan(length(agg_nominal_depth)+1,length(fv00_time)); + + % Include a row of zeros to set surface depth as 0 dbar + + interp_agg_pres(1,:) = zeros(size(fv00_time)); + + agg_nominal_depth_with_0 = [0; agg_nominal_depth]; + + % Loop through each nominal depth in the aggregate file, and get pressure for the FV00 file's time + for j = 1:(length(agg_nominal_depth)) + + % Select the relevant time and pressures + + time_selection = agg_time(agg_instrument_index == (j-1)); + pres_selection = agg_pres(agg_instrument_index == (j-1)); + + % Interpolate along each nominal depth + + interp_agg_pres(j+1,:) = interp1(time_selection,pres_selection,fv00_time); + end + + % Sort the nominal depths and pressures + + [agg_nominal_depth_with_0,sort_idx] = sort(agg_nominal_depth_with_0); + + interp_agg_pres = interp_agg_pres(sort_idx,:); + + + % Linearly interpolate at each timestamp to replace NaN values + + interp_agg_pres = fillmissing(interp_agg_pres,'linear','SamplePoints',agg_nominal_depth_with_0); + + for j = 1:length(fv00_pres) + + if isnan(fv00_pres(j)) + + fv00_pres(j) = interp_agg_pres(agg_nominal_depth_with_0==fv00_depth,j); + + end + + end + + % Create an FV01 version of the current FV00 file + + % Create the new FV01 file name + + fv01_name = strrep(fv00_files(i).name,'FV00','FV01'); + fv01_name(end-10:end-3)=datestr(now,'yyyymmdd'); + + % Write the FV00 data into the FV01 file + ncwriteschema(fv01_name, fv00_contents); + + % copy variable data to new file + for v = fv00_contents.Variables + ncwrite(fv01_name, v.Name, ncread(fv00_files(i).name, v.Name)); + end + + % Modify the global attributes of the file to record processing, + % and add to the file history + + ncwriteatt(fv01_name,'/','file_version','Level 1 - partially processed'); + hist = ncreadatt(fv00_files(i).name, '/', 'history'); + ncwriteatt(fv01_name,'/','history',[hist newline datestr(now,'yyyy-mm-dd') ' : Filled missing pressure with interpolated pressure from ' agg_files.name]); + + % Add and populate a PRES variable to the FV01 file + %nccreate(fv01_name, 'PRES', 'Dimensions',{'TIME',size(fv00_pres,1)}, 'FillValue',NaN); + ncwrite(fv01_name, 'PRES', fv00_pres); + + % copy attributes from agg file to output file + pres_atts = agg_pres_info.Attributes; % get all attribtes from the aggregate file + for k=1:length(pres_atts) + if (strcmp(pres_atts(k).Name, '_FillValue') == 0) + ncwriteatt(fv01_name, 'PRES', pres_atts(k).Name, pres_atts(k).Value); + end + end + + % Add the relevant attributes to the PRES variable, including a + % comment noting that the data has been linearly interpolated + ncwriteatt(fv01_name, 'PRES', 'comment','originally missing pressure data has been interpolated from surrounding pressure sensors'); + end end \ No newline at end of file From 3e0f9a960df28430a7f0670ccfbd00fc3cb0d776 Mon Sep 17 00:00:00 2001 From: bweeding <57697604+bweeding@users.noreply.github.com> Date: Wed, 5 Feb 2020 14:45:48 +1100 Subject: [PATCH 06/59] Create pressure_interpolator.py --- ocean_dp/processing/pressure_interpolator.py | 115 +++++++++++++++++++ 1 file changed, 115 insertions(+) create mode 100755 ocean_dp/processing/pressure_interpolator.py diff --git a/ocean_dp/processing/pressure_interpolator.py b/ocean_dp/processing/pressure_interpolator.py new file mode 100755 index 0000000..311d03d --- /dev/null +++ b/ocean_dp/processing/pressure_interpolator.py @@ -0,0 +1,115 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Tue Feb 4 11:05:16 2020 + +@author: tru050 +""" + +import re +from datetime import datetime, timedelta +from netCDF4 import num2date, date2num +from netCDF4 import stringtochar +import numpy.ma as ma +import numpy as np +import sys +from netCDF4 import Dataset +import numpy +import argparse +import glob +import pandas as pd +import scipy + +def pressure_interpolator: + + # Load the filenames of the FV00 files in the current folder + fv00_files = glob.glob('*FV00*.nc'); + + # Extract the aggregate file data + agg = Dataset(glob.glob('*Aggregate*.nc')[0], mode="r") + + # Loop through each of the FV00 files + for i in fv00_files: + + # Extract the contents of the current file + fv00_contents = Dataset(i, mode="r") + + # Check the current file doesn't contain pressure to run the following + # interpolator + if not 'PRES' in fv00_contents.variables: + + # Create a NaN array to fill with pressure values + interp_agg_pres = np.full((len(agg.variables["NOMINAL_DEPTH"])+1,len(fv00_contents.variables["TIME"])),np.nan) + + # Set the first row as zeros to set 0m as 0dbar + interp_agg_pres[0,:] = 0 + + # Set the last row to 5000 to set 5000m as 5000dbar (~seafloor), + # only for interpolation in cases where the deepest sensor has failed + #interp_agg_pres[-1,:] = 5000 + + # Create a new array representing the nominal depths of the agg file, + # including the 0m values + agg_nominal_depths = np.insert(np.array(agg.variables["NOMINAL_DEPTH"][:]),0,0) + + # For each nominal depth, interpolate the agg data at the FV00 times + for j in range(1,len(agg_nominal_depths)): + + time_selection = agg.variables["TIME"][agg.variables["instrument_index"][:]==(j-1)] + + pres_selection = agg.variables["PRES"][agg.variables["instrument_index"][:]==(j-1)] + + interp_agg_pres[j,:] = np.interp(fv00_contents.variables["TIME"][:],time_selection,pres_selection) + + # Sort the nominal depths and pressures according to nominal depth + interp_agg_pres = interp_agg_pres[np.argsort(agg_nominal_depths),:] + + agg_nominal_depths.sort() + + # If there are any NaN values, linearly interpolate profilewise + if np.isnan(np.sum(interp_agg_pres)): + + # Make a dataframe of the interpolated pressure to handle NaNs easily + interp_agg_pres_df = pd.DataFrame(data=interp_agg_pres,index=agg_nominal_depths) + + # Find all the columns where the lowest element is NaN + nan_cols = interp_agg_pres_df[interp_agg_pres_df[-1:].isna()].tolist() + + # Select each column containing an NaN as the deepest value + for j in nan_cols: + + # Find the shallowest nominal depth that isn't NaN + shallowest_val = pd.Series.last_valid_index(interp_agg_pres_df.iloc[:,j]) + + # Find the index of that nominal depth + shallowest_idx = interp_agg_pres_df.index.tolist().index(shallowest_val) + + # Starting at the shallowest NaN in a continous block of NaNs to the bottom + for k in range(shallowest_idx+1,len(interp_agg_pres_df)): + + # Linearly interpolate from shallow to deep, based on a nominal depth difference of 1m equating to 1dbar + interp_agg_pres_df.iloc[k,j] = interp_agg_pres_df.iloc[k-1,j]+np.diff(interp_agg_pres_df.index)[k-1] + + # Linearly interpolate any remaining NaNs + interp_agg_pres_df = interp_agg_pres_df.interpolate(method="index") + + # Convert the DataFrame back to an array + interp_agg_pres = interp_agg_pres_df.to_numpy() + + # Create a NaN array to receive the FV00 interpolated pressures + interp_fv00_pres = np.full((np.shape(fv00_contents.variables["TIME"][:])),np.nan) + + # At each timestamp, interpolate pressure for the FV00 data + for j in range(len(fv00_contents.variables["TIME"])): + + interp_fv00_pres[j] = np.interp(fv00_contents.variables["NOMINAL_DEPTH"][0],agg_nominal_depths,interp_agg_pres[:,j]) + + + + + + + + + + From dde748f5b90ff75b8f6874584a07e03b294fd678 Mon Sep 17 00:00:00 2001 From: bweeding <57697604+bweeding@users.noreply.github.com> Date: Wed, 5 Feb 2020 14:45:56 +1100 Subject: [PATCH 07/59] Update add_interp_press.m --- matlab/add_interp_press.m | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/matlab/add_interp_press.m b/matlab/add_interp_press.m index 5e5ff5a..5ba9817 100644 --- a/matlab/add_interp_press.m +++ b/matlab/add_interp_press.m @@ -134,6 +134,30 @@ nccreate(fv01_name, 'PRES', 'Dimensions',{'TIME',size(pres_interp,1)}, 'FillValue',NaN); ncwrite(fv01_name, 'PRES', pres_interp); + % Add quality control variables to the FV01 file, assigning 8 to + % interpolated data in line with Argo + for v = fv00_contents.Variables + + if ~isempty(v.Dimensions) + + nccreate(fv01_name, v.Name + "_quality_control",'Dimensions',{v.Dimensions.Name,v.Dimensions.Length},'FillValue',99); + + ncwriteatt(fv01_name,v.Name + "_quality_control",'long_name',"quality_code for"+v.Name); + + ncwriteatt(fv01_name,v.Name,'ancillary_variables',v.Name + "_quality_control"); + + if contains(v.Name,'PRES') + + ncwrite(fv01_name, v.Name + "_quality_control",8*ones(size(fv00_time))); + + end + + end + + end + + + % copy attributes from agg file to output file pres_atts = agg_pres_info.Attributes; % get all attribtes from the aggregate file for k=1:length(pres_atts) From b7afd52237c021633a0992603e9057ae2f6a930c Mon Sep 17 00:00:00 2001 From: bweeding <57697604+bweeding@users.noreply.github.com> Date: Wed, 5 Feb 2020 14:46:00 +1100 Subject: [PATCH 08/59] Update copyDataset.py --- ocean_dp/aggregation/copyDataset.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/ocean_dp/aggregation/copyDataset.py b/ocean_dp/aggregation/copyDataset.py index 154757f..d23fe7d 100644 --- a/ocean_dp/aggregation/copyDataset.py +++ b/ocean_dp/aggregation/copyDataset.py @@ -15,7 +15,6 @@ # similar more general tool project https://ncagg.readthedocs.io/en/latest/ (does not work on python3 2019-10-01) # has configurable way of dealing with attributes - # file sets to test against # http://thredds.aodn.org.au/thredds/catalog/IMOS/ANMN/NRS/NRSKAI/Temperature/catalog.html # http://thredds.aodn.org.au/thredds/catalog/IMOS/ANMN/NRS/NRSKAI/Biogeochem_profiles/catalog.html @@ -344,7 +343,7 @@ def aggregate(files, varNames): dMin = maVariableAll.max(0) ncOut.setncattr("geospatial_vertical_max", dMax) ncOut.setncattr("geospatial_vertical_min", dMin) - + dsIn.close() # we're done with the varList now ncOut.close() From a9c85a9d413dccd7ddfb651efab7ce1a68656d1f Mon Sep 17 00:00:00 2001 From: bweeding <57697604+bweeding@users.noreply.github.com> Date: Wed, 5 Feb 2020 14:46:02 +1100 Subject: [PATCH 09/59] Create flatline_test.py --- ocean_dp/qc/flatline_test.py | 153 +++++++++++++++++++++++++++++++++++ 1 file changed, 153 insertions(+) create mode 100755 ocean_dp/qc/flatline_test.py diff --git a/ocean_dp/qc/flatline_test.py b/ocean_dp/qc/flatline_test.py new file mode 100755 index 0000000..a4e4ec5 --- /dev/null +++ b/ocean_dp/qc/flatline_test.py @@ -0,0 +1,153 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Mon Feb 3 14:10:41 2020 + +@author: tru050 +""" + +import re +from datetime import datetime, timedelta +from netCDF4 import num2date, date2num +from netCDF4 import stringtochar +import numpy.ma as ma +import sys +from netCDF4 import Dataset +import numpy as np +import argparse +import glob +import pytz +import os + +#!/usr/bin/python3 + +# add_qc_flags +# Copyright (C) 2020 Peter Jansen +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +# add QC variables to file + + +def add_qc(netCDFfile): + + new_name = [] # list of new file names + + # loop over all file names given + for fn in netCDFfile[1:]: + ds = Dataset(fn, 'a') + + # read the variable names from the netCDF dataset + vars = ds.variables + + # create a list of variables, don't include the 'TIME' variable + # TODO: detect 'TIME' variable using the standard name 'time' + to_add = [] + for v in vars: + #print (vars[v].dimensions) + if v != 'TIME': + to_add.append(v) + + # for each variable, add a new ancillary variable _quality_control to each which has 'TIME' as a dimension + for v in to_add: + if "TIME" in vars[v].dimensions: + # print("time dim ", v) + + ncVarOut = ds.createVariable(v+"_quality_control", "i1", vars[v].dimensions, fill_value=99, zlib=True) # fill_value=99 otherwise defaults to max, imos-toolbox uses 99 + ncVarOut[:] = np.zeros(vars[v].shape) + ncVarOut.long_name = "quality_code for " + v + + vars[v].ancillary_variables = v + "_quality_control" + + # update the file version attribute + ds.file_version = "Level 1 - Quality Controlled Data" + + ds.close() + + # rename the file FV00 to FV01 (imos specific) + fn_new = fn.replace("FV00", "FV01") + new_name.append(fn_new) + + if fn_new != fn: + # copy file + os.copy(fn, fn_new) + + print(fn_new) + + return new_name + + +if __name__ == "__main__": + add_qc(sys.argv) + +############################################################################## + +def flatline_test(*target_files,target_vars=[],window=3): + + # If files aren't specified, take all the .nc files in the current folder + if not target_files: + + target_files = glob.glob('*.nc') + + # Loop through each files in target_files + for current_file in target_files: + + + # Print each filename + print("input file %s" % current_file) + + # Extract netcdf data into nc + nc = Dataset(current_file, mode="r") + + # Extract time + ncTime = nc.get_variables_by_attributes(standard_name='time') + + # If target_vars aren't user specified, set it to all the variables of + # the current_file, removing TIME + if target_vars == []: + + target_vars = list(nc.variables.keys()) + + target_vars.remove('TIME') + + # Check if file contains quality control variables, and if not create + + if not any("_quality_control" in i for i in target_vars: + + # insert _quality_control variables into file? + # should this be done now, or should we assume it + # will have already been done? + + + # For each variable, extract the data + for current_var in target_vars: + + var_data = np.array(nc.variables[current_var]) + + for i in 0:(len(var_data)-window+1): + + # This is true if 'window' elements in a row are equal + if len(set(var_data[i:(i+window)])) == 1 + + # set corresponding QC value to... + + + + + + + + + + \ No newline at end of file From 1193875bda18d662cef5ca5bccf42f48f65032f1 Mon Sep 17 00:00:00 2001 From: bweeding <57697604+bweeding@users.noreply.github.com> Date: Wed, 12 Feb 2020 16:03:17 +1100 Subject: [PATCH 10/59] Selects deployed data --- ocean_dp/processing/pressure_interpolator.py | 75 +++++++++++- ocean_dp/qc/add_qc_flags.py | 61 +++++++--- ocean_dp/qc/select_in_water.py | 114 +++++++++++++++++++ 3 files changed, 227 insertions(+), 23 deletions(-) create mode 100755 ocean_dp/qc/select_in_water.py diff --git a/ocean_dp/processing/pressure_interpolator.py b/ocean_dp/processing/pressure_interpolator.py index 311d03d..cb929af 100755 --- a/ocean_dp/processing/pressure_interpolator.py +++ b/ocean_dp/processing/pressure_interpolator.py @@ -19,6 +19,8 @@ import glob import pandas as pd import scipy +import os +import shutil def pressure_interpolator: @@ -44,10 +46,6 @@ def pressure_interpolator: # Set the first row as zeros to set 0m as 0dbar interp_agg_pres[0,:] = 0 - # Set the last row to 5000 to set 5000m as 5000dbar (~seafloor), - # only for interpolation in cases where the deepest sensor has failed - #interp_agg_pres[-1,:] = 5000 - # Create a new array representing the nominal depths of the agg file, # including the 0m values agg_nominal_depths = np.insert(np.array(agg.variables["NOMINAL_DEPTH"][:]),0,0) @@ -103,9 +101,76 @@ def pressure_interpolator: for j in range(len(fv00_contents.variables["TIME"])): interp_fv00_pres[j] = np.interp(fv00_contents.variables["NOMINAL_DEPTH"][0],agg_nominal_depths,interp_agg_pres[:,j]) - + + # Use methods from add_qc_flags to make a new netcdf? + + # Deal with files that already contain pressure, but may contain NaNs + else: + # Create a NaN array to fill with pressure values + interp_agg_pres = np.full((len(agg.variables["NOMINAL_DEPTH"])+1,len(fv00_contents.variables["TIME"])),np.nan) + + # Set the first row as zeros to set 0m as 0dbar + interp_agg_pres[0,:] = 0 + + # Set the last row to 5000 to set 5000m as 5000dbar (~seafloor), + # only for interpolation in cases where the deepest sensor has failed + #interp_agg_pres[-1,:] = 5000 + + # Create a new array representing the nominal depths of the agg file, + # including the 0m values + agg_nominal_depths = np.insert(np.array(agg.variables["NOMINAL_DEPTH"][:]),0,0) + + # For each nominal depth, interpolate the agg data at the FV00 times + for j in range(1,len(agg_nominal_depths)): + + time_selection = agg.variables["TIME"][agg.variables["instrument_index"][:]==(j-1)] + + pres_selection = agg.variables["PRES"][agg.variables["instrument_index"][:]==(j-1)] + + interp_agg_pres[j,:] = np.interp(fv00_contents.variables["TIME"][:],time_selection,pres_selection) + + # Sort the nominal depths and pressures according to nominal depth + interp_agg_pres = interp_agg_pres[np.argsort(agg_nominal_depths),:] + + agg_nominal_depths.sort() + # If there are any NaN values, linearly interpolate profilewise + if np.isnan(np.sum(interp_agg_pres)): + + # Make a dataframe of the interpolated pressure to handle NaNs easily + interp_agg_pres_df = pd.DataFrame(data=interp_agg_pres,index=agg_nominal_depths) + + # Find all the columns where the lowest element is NaN + nan_cols = interp_agg_pres_df[interp_agg_pres_df[-1:].isna()].tolist() + + # Select each column containing an NaN as the deepest value + for j in nan_cols: + + # Find the shallowest nominal depth that isn't NaN + shallowest_val = pd.Series.last_valid_index(interp_agg_pres_df.iloc[:,j]) + + # Find the index of that nominal depth + shallowest_idx = interp_agg_pres_df.index.tolist().index(shallowest_val) + # Starting at the shallowest NaN in a continous block of NaNs to the bottom + for k in range(shallowest_idx+1,len(interp_agg_pres_df)): + + # Linearly interpolate from shallow to deep, based on a nominal depth difference of 1m equating to 1dbar + interp_agg_pres_df.iloc[k,j] = interp_agg_pres_df.iloc[k-1,j]+np.diff(interp_agg_pres_df.index)[k-1] + + # Linearly interpolate any remaining NaNs + interp_agg_pres_df = interp_agg_pres_df.interpolate(method="index") + + # Convert the DataFrame back to an array + interp_agg_pres = interp_agg_pres_df.to_numpy() + + # Create a NaN array to receive the FV00 interpolated pressures + interp_fv00_pres = np.full((np.shape(fv00_contents.variables["TIME"][:])),np.nan) + + # Extract the interpolated pressures (NaNs removed) to store in netCDF4 + interp_fv00_pres = interp_agg_pres_df[interp_agg_pres_df.index==fv00_contents.variables["NOMINAL_DEPTH"][:]] + + # diff --git a/ocean_dp/qc/add_qc_flags.py b/ocean_dp/qc/add_qc_flags.py index 7d00c63..bd68234 100644 --- a/ocean_dp/qc/add_qc_flags.py +++ b/ocean_dp/qc/add_qc_flags.py @@ -18,11 +18,12 @@ from netCDF4 import Dataset, num2date import sys - +from datetime import datetime import numpy as np from dateutil import parser import pytz import os +import shutil # add QC variables to file @@ -32,8 +33,31 @@ def add_qc(netCDFfile): new_name = [] # list of new file names # loop over all file names given - for fn in netCDFfile[1:]: - ds = Dataset(fn, 'a') + for fn in netCDFfile: + + # rename the file FV00 to FV01 (imos specific) + fn_new = fn.replace("FV00", "FV01") + + # Change the creation date in the filename to today + now=datetime.utcnow() + + + + fn_new = "".join((fn_new[0:-11],now.strftime("%Y%m%d"),fn_new[-3::])) + + # Add the new file name to the list of new file names + new_name.append(fn_new) + + # If a new (different) filename has been successfully generated, make + # a copy of the old file with the new name + if fn_new != fn: + # copy file + shutil.copy(fn, fn_new) + + + print(fn_new) + + ds = Dataset(fn_new, 'a') # read the variable names from the netCDF dataset vars = ds.variables @@ -51,29 +75,30 @@ def add_qc(netCDFfile): if "TIME" in vars[v].dimensions: # print("time dim ", v) - ncVarOut = ds.createVariable(v+"_quality_control", "i1", vars[v].dimensions, fill_value=99, zlib=True) # fill_value=99 otherwise defaults to max, imos-toolbox uses 99 - ncVarOut[:] = np.zeros(vars[v].shape) - ncVarOut.long_name = "quality_code for " + v + if v+"_quality_control" not in ds.variables: + ncVarOut = ds.createVariable(v+"_quality_control", "i1", vars[v].dimensions, fill_value=99, zlib=True) # fill_value=99 otherwise defaults to max, imos-toolbox uses 99 + ncVarOut[:] = np.zeros(vars[v].shape) + ncVarOut.long_name = "quality_code for " + v + ncVarOut.flag_values = np.array([0, 1, 2, 3, 4, 6, 7, 9]) + ncVarOut.flag_meanings = 'unknown good_data probably_good_data probably_bad_data bad_data not_deployed interpolated missing_value' + - vars[v].ancillary_variables = v + "_quality_control" + vars[v].ancillary_variables = v + "_quality_control" - # update the file version attribute + # update the global attributes ds.file_version = "Level 1 - Quality Controlled Data" + + ds.date_created = now.strftime("%Y-%m-%dT%H:%M:%SZ") + + ds.history += ' ' + now.strftime("%Y%m%d:") + ' converted to FV01 file, quality_control variables added.' - ds.close() - - # rename the file FV00 to FV01 (imos specific) - fn_new = fn.replace("FV00", "FV01") - new_name.append(fn_new) + # ADD quality control attributes!! - if fn_new != fn: - # copy file - os.copy(fn, fn_new) + ds.close() - print(fn_new) return new_name if __name__ == "__main__": - add_qc(sys.argv) + add_qc(sys.argv[1:]) \ No newline at end of file diff --git a/ocean_dp/qc/select_in_water.py b/ocean_dp/qc/select_in_water.py new file mode 100755 index 0000000..b600695 --- /dev/null +++ b/ocean_dp/qc/select_in_water.py @@ -0,0 +1,114 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Wed Feb 12 09:49:26 2020 + +@author: tru050 +""" +from dateutil.parser import parse +from netCDF4 import Dataset, num2date, date2num +from datetime import datetime, timedelta +import sys +from datetime import datetime +import numpy as np +from dateutil import parser +import pytz +import os +import shutil + +def select_in_water(netCDFfile): + + new_name = [] # list of new file names + + # loop over all file names given + for fn in netCDFfile: + + # Change the creation date in the filename to today + now=datetime.utcnow() + + fn_new = fn.replace("FV00", "FV01") + + fn_new = "".join((fn_new[0:-11],now.strftime("%Y%m%d"),fn_new[-3::])) + + # Add the new file name to the list of new file names + new_name.append(fn_new) + + # Load the original netcdf file + ods = Dataset(fn,'a') + + # Extract the time dimension, and the deployment start and end + time = np.array(ods.variables['TIME'][:]) + + inw = parse(ods.time_deployment_start) + + outw = parse(ods.time_deployment_end) + + # Convert the start and end to the number format used in TIME + inw_num = date2num(inw.replace(tzinfo=None),units = ods.variables['TIME'].units) + + outw_num = date2num(outw.replace(tzinfo=None),units = ods.variables['TIME'].units) + + # Create logical index of deployed times + + deployed = np.logical_and(time>=inw_num,time<=outw_num) + + # Determine the length of the new time dimension + + time_dim = len(time[deployed]) + + # Create the new netcdf file + ds = Dataset(fn_new, "w", format="NETCDF4") + + TIME = ds.createDimension("TIME",time_dim) + + # Copy global attributes + + for att in ods.ncattrs(): + + ds.setncattr(att,ods.getncattr(att)) + + # Copy variables + + for v_name, varin in ods.variables.items(): + + varout = ds.createVariable(v_name, varin.datatype, varin.dimensions) + + # Copy variable attributes + varout.setncatts({k: varin.getncattr(k) for k in varin.ncattrs()}) + + # Fill variables with deployed data + + if np.array(varin[:]).size == 1: + + varout[:] = varin[:] + + else: + + varout[:] = np.array(varin[:])[deployed] + + ds.date_created = now.strftime("%Y-%m-%dT%H:%M:%SZ") + + ds.history += ' ' + now.strftime("%Y%m%d:") + 'Data subset to only contain deployed (in water) data - the full record can be found in the corresponding FV00 file.' + + ds.close() + + ods.close() + + + + + + + + + + + + + + + + + + + \ No newline at end of file From 774f7dabd15ba1cbdff4e26fe72bb4b3a02fc18a Mon Sep 17 00:00:00 2001 From: Peter Jansen Date: Wed, 12 Feb 2020 16:30:46 +1100 Subject: [PATCH 11/59] Update select_in_water.py minor clean ups, should look at using split for file name manipulation. --- ocean_dp/qc/select_in_water.py | 85 ++++++++++++++++------------------ 1 file changed, 39 insertions(+), 46 deletions(-) diff --git a/ocean_dp/qc/select_in_water.py b/ocean_dp/qc/select_in_water.py index b600695..f23df57 100755 --- a/ocean_dp/qc/select_in_water.py +++ b/ocean_dp/qc/select_in_water.py @@ -1,10 +1,21 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -""" -Created on Wed Feb 12 09:49:26 2020 +#!/usr/bin/python3 + +# ocean_dp +# Copyright (C) 2020 Ben Weeding +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . -@author: tru050 -""" from dateutil.parser import parse from netCDF4 import Dataset, num2date, date2num from datetime import datetime, timedelta @@ -16,19 +27,19 @@ import os import shutil -def select_in_water(netCDFfile): + +def select_in_water(netCDFfiles): new_name = [] # list of new file names # loop over all file names given - for fn in netCDFfile: + for fn in netCDFfiles: # Change the creation date in the filename to today now=datetime.utcnow() - fn_new = fn.replace("FV00", "FV01") - - fn_new = "".join((fn_new[0:-11],now.strftime("%Y%m%d"),fn_new[-3::])) + fn_new = fn.replace("FV00", "FV01") + fn_new = "".join((fn_new[0:-11],now.strftime("%Y%m%d"),fn_new[-3::])) # might be better to use split("-") here, and manybe even a check for IMOS file name # Add the new file name to the list of new file names new_name.append(fn_new) @@ -40,20 +51,16 @@ def select_in_water(netCDFfile): time = np.array(ods.variables['TIME'][:]) inw = parse(ods.time_deployment_start) - outw = parse(ods.time_deployment_end) # Convert the start and end to the number format used in TIME inw_num = date2num(inw.replace(tzinfo=None),units = ods.variables['TIME'].units) - outw_num = date2num(outw.replace(tzinfo=None),units = ods.variables['TIME'].units) - # Create logical index of deployed times - + # Create logical index of deployed times deployed = np.logical_and(time>=inw_num,time<=outw_num) - # Determine the length of the new time dimension - + # Determine the length of the new time dimension time_dim = len(time[deployed]) # Create the new netcdf file @@ -61,14 +68,12 @@ def select_in_water(netCDFfile): TIME = ds.createDimension("TIME",time_dim) - # Copy global attributes - + # Copy global attributes for att in ods.ncattrs(): ds.setncattr(att,ods.getncattr(att)) - # Copy variables - + # Copy variables for v_name, varin in ods.variables.items(): varout = ds.createVariable(v_name, varin.datatype, varin.dimensions) @@ -76,8 +81,7 @@ def select_in_water(netCDFfile): # Copy variable attributes varout.setncatts({k: varin.getncattr(k) for k in varin.ncattrs()}) - # Fill variables with deployed data - + # Fill variables with deployed data if np.array(varin[:]).size == 1: varout[:] = varin[:] @@ -86,29 +90,18 @@ def select_in_water(netCDFfile): varout[:] = np.array(varin[:])[deployed] - ds.date_created = now.strftime("%Y-%m-%dT%H:%M:%SZ") + ds.date_created = now.strftime("%Y-%m-%dT%H:%M:%SZ") - ds.history += ' ' + now.strftime("%Y%m%d:") + 'Data subset to only contain deployed (in water) data - the full record can be found in the corresponding FV00 file.' - - ds.close() + # update the history attribute + try: + hist = ds.history + "\n" + except AttributeError: + hist = "" + ds.history += hist + now.strftime("%Y%m%d:") + 'Data subset to only contain deployed (in water) data - the full record can be found in the corresponding FV00 file.' + ds.close() ods.close() - - - - - - - - - - - - - - - - - - - \ No newline at end of file + + +if __name__ == "__main__": + select_in_water(sys.argv[1:]) From 40e89b2f800d0fd7c5d2241b78140316fdf9133f Mon Sep 17 00:00:00 2001 From: bweeding <57697604+bweeding@users.noreply.github.com> Date: Thu, 13 Feb 2020 15:39:04 +1100 Subject: [PATCH 12/59] Update select_in_water.py --- ocean_dp/qc/select_in_water.py | 122 ++++++++++++++++++--------------- 1 file changed, 65 insertions(+), 57 deletions(-) diff --git a/ocean_dp/qc/select_in_water.py b/ocean_dp/qc/select_in_water.py index f23df57..6d5b7fb 100755 --- a/ocean_dp/qc/select_in_water.py +++ b/ocean_dp/qc/select_in_water.py @@ -35,72 +35,80 @@ def select_in_water(netCDFfiles): # loop over all file names given for fn in netCDFfiles: - # Change the creation date in the filename to today - now=datetime.utcnow() + # Check the file is an IMOS formatted file + if fn.split('_')[0]=='IMOS' - fn_new = fn.replace("FV00", "FV01") - fn_new = "".join((fn_new[0:-11],now.strftime("%Y%m%d"),fn_new[-3::])) # might be better to use split("-") here, and manybe even a check for IMOS file name - - # Add the new file name to the list of new file names - new_name.append(fn_new) + # Change the creation date in the filename to today + now=datetime.utcnow() - # Load the original netcdf file - ods = Dataset(fn,'a') - - # Extract the time dimension, and the deployment start and end - time = np.array(ods.variables['TIME'][:]) - - inw = parse(ods.time_deployment_start) - outw = parse(ods.time_deployment_end) - - # Convert the start and end to the number format used in TIME - inw_num = date2num(inw.replace(tzinfo=None),units = ods.variables['TIME'].units) - outw_num = date2num(outw.replace(tzinfo=None),units = ods.variables['TIME'].units) - - # Create logical index of deployed times - deployed = np.logical_and(time>=inw_num,time<=outw_num) - - # Determine the length of the new time dimension - time_dim = len(time[deployed]) - - # Create the new netcdf file - ds = Dataset(fn_new, "w", format="NETCDF4") - - TIME = ds.createDimension("TIME",time_dim) - - # Copy global attributes - for att in ods.ncattrs(): + fn_new = fn.replace("FV00", "FV01") - ds.setncattr(att,ods.getncattr(att)) - - # Copy variables - for v_name, varin in ods.variables.items(): + fn_new_split = fn_new.split('_') - varout = ds.createVariable(v_name, varin.datatype, varin.dimensions) - - # Copy variable attributes - varout.setncatts({k: varin.getncattr(k) for k in varin.ncattrs()}) - - # Fill variables with deployed data - if np.array(varin[:]).size == 1: - - varout[:] = varin[:] + fn_new_split[-1] = "C-" + now.strftime("%Y%m%d") + + fn_new = '_'.join(fn_new_split) + + # Add the new file name to the list of new file names + new_name.append(fn_new) - else: + # Load the original netcdf file + ods = Dataset(fn,'a') + + # Extract the time dimension, and the deployment start and end + time = np.array(ods.variables['TIME'][:]) + + inw = parse(ods.time_deployment_start) + outw = parse(ods.time_deployment_end) + + # Convert the start and end to the number format used in TIME + inw_num = date2num(inw.replace(tzinfo=None),units = ods.variables['TIME'].units) + outw_num = date2num(outw.replace(tzinfo=None),units = ods.variables['TIME'].units) + + # Create logical index of deployed times + deployed = np.logical_and(time>=inw_num,time<=outw_num) + + # Determine the length of the new time dimension + time_dim = len(time[deployed]) + + # Create the new netcdf file + ds = Dataset(fn_new, "w", format="NETCDF4") + + TIME = ds.createDimension("TIME",time_dim) + + # Copy global attributes + for att in ods.ncattrs(): - varout[:] = np.array(varin[:])[deployed] + ds.setncattr(att,ods.getncattr(att)) + + # Copy variables + for v_name, varin in ods.variables.items(): - ds.date_created = now.strftime("%Y-%m-%dT%H:%M:%SZ") + varout = ds.createVariable(v_name, varin.datatype, varin.dimensions) - # update the history attribute - try: - hist = ds.history + "\n" - except AttributeError: - hist = "" - ds.history += hist + now.strftime("%Y%m%d:") + 'Data subset to only contain deployed (in water) data - the full record can be found in the corresponding FV00 file.' + # Copy variable attributes + varout.setncatts({k: varin.getncattr(k) for k in varin.ncattrs()}) - ds.close() - ods.close() + # Fill variables with deployed data + if np.array(varin[:]).size == 1: + + varout[:] = varin[:] + + else: + + varout[:] = np.array(varin[:])[deployed] + + ds.date_created = now.strftime("%Y-%m-%dT%H:%M:%SZ") + + # update the history attribute + try: + hist = ds.history + "\n" + except AttributeError: + hist = "" + ds.history += hist + now.strftime("%Y%m%d:") + 'Data subset to only contain deployed (in water) data - the full record can be found in the corresponding FV00 file.' + + ds.close() + ods.close() if __name__ == "__main__": From 1e2f36b5818d4f6c97e4c15af5c28baa667a452d Mon Sep 17 00:00:00 2001 From: bweeding <57697604+bweeding@users.noreply.github.com> Date: Thu, 13 Feb 2020 15:39:28 +1100 Subject: [PATCH 13/59] Update in_water with split and IMOS test --- ocean_dp/qc/select_in_water.py | 171 ++++++++++++++++----------------- ocean_dp/qc/spike_test | 8 ++ 2 files changed, 93 insertions(+), 86 deletions(-) create mode 100755 ocean_dp/qc/spike_test diff --git a/ocean_dp/qc/select_in_water.py b/ocean_dp/qc/select_in_water.py index b600695..6925d12 100755 --- a/ocean_dp/qc/select_in_water.py +++ b/ocean_dp/qc/select_in_water.py @@ -1,10 +1,19 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -""" -Created on Wed Feb 12 09:49:26 2020 +# ocean_dp +# Copyright (C) 2020 Ben Weeding +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . -@author: tru050 -""" from dateutil.parser import parse from netCDF4 import Dataset, num2date, date2num from datetime import datetime, timedelta @@ -16,99 +25,89 @@ import os import shutil -def select_in_water(netCDFfile): + +def select_in_water(netCDFfiles): new_name = [] # list of new file names # loop over all file names given - for fn in netCDFfile: - - # Change the creation date in the filename to today - now=datetime.utcnow() + for fn in netCDFfiles: - fn_new = fn.replace("FV00", "FV01") + # Check the file is an IMOS formatted file + if fn.split('_')[0]=='IMOS' - fn_new = "".join((fn_new[0:-11],now.strftime("%Y%m%d"),fn_new[-3::])) - - # Add the new file name to the list of new file names - new_name.append(fn_new) + # Change the creation date in the filename to today + now=datetime.utcnow() - # Load the original netcdf file - ods = Dataset(fn,'a') - - # Extract the time dimension, and the deployment start and end - time = np.array(ods.variables['TIME'][:]) - - inw = parse(ods.time_deployment_start) - - outw = parse(ods.time_deployment_end) - - # Convert the start and end to the number format used in TIME - inw_num = date2num(inw.replace(tzinfo=None),units = ods.variables['TIME'].units) - - outw_num = date2num(outw.replace(tzinfo=None),units = ods.variables['TIME'].units) - - # Create logical index of deployed times - - deployed = np.logical_and(time>=inw_num,time<=outw_num) - - # Determine the length of the new time dimension - - time_dim = len(time[deployed]) - - # Create the new netcdf file - ds = Dataset(fn_new, "w", format="NETCDF4") - - TIME = ds.createDimension("TIME",time_dim) - - # Copy global attributes - - for att in ods.ncattrs(): + fn_new = fn.replace("FV00", "FV01") - ds.setncattr(att,ods.getncattr(att)) - - # Copy variables + fn_new_split = fn_new.split('_') - for v_name, varin in ods.variables.items(): + fn_new_split[-1] = "C-" + now.strftime("%Y%m%d") - varout = ds.createVariable(v_name, varin.datatype, varin.dimensions) - - # Copy variable attributes - varout.setncatts({k: varin.getncattr(k) for k in varin.ncattrs()}) - - # Fill variables with deployed data + fn_new = '_'.join(fn_new_split) - if np.array(varin[:]).size == 1: + # Add the new file name to the list of new file names + new_name.append(fn_new) - varout[:] = varin[:] - - else: - - varout[:] = np.array(varin[:])[deployed] + # Load the original netcdf file + ods = Dataset(fn,'a') + + # Extract the time dimension, and the deployment start and end + time = np.array(ods.variables['TIME'][:]) + + inw = parse(ods.time_deployment_start) + outw = parse(ods.time_deployment_end) + + # Convert the start and end to the number format used in TIME + inw_num = date2num(inw.replace(tzinfo=None),units = ods.variables['TIME'].units) + outw_num = date2num(outw.replace(tzinfo=None),units = ods.variables['TIME'].units) + + # Create logical index of deployed times + deployed = np.logical_and(time>=inw_num,time<=outw_num) + + # Determine the length of the new time dimension + time_dim = len(time[deployed]) + + # Create the new netcdf file + ds = Dataset(fn_new, "w", format="NETCDF4") + + TIME = ds.createDimension("TIME",time_dim) + + # Copy global attributes + for att in ods.ncattrs(): - ds.date_created = now.strftime("%Y-%m-%dT%H:%M:%SZ") - - ds.history += ' ' + now.strftime("%Y%m%d:") + 'Data subset to only contain deployed (in water) data - the full record can be found in the corresponding FV00 file.' - - ds.close() - - ods.close() - - - - - - - - - - - - - - - + ds.setncattr(att,ods.getncattr(att)) + # Copy variables + for v_name, varin in ods.variables.items(): + + varout = ds.createVariable(v_name, varin.datatype, varin.dimensions) + + # Copy variable attributes + varout.setncatts({k: varin.getncattr(k) for k in varin.ncattrs()}) + + # Fill variables with deployed data + if np.array(varin[:]).size == 1: + + varout[:] = varin[:] + + else: + + varout[:] = np.array(varin[:])[deployed] + + ds.date_created = now.strftime("%Y-%m-%dT%H:%M:%SZ") + # update the history attribute + try: + hist = ds.history + "\n" + except AttributeError: + hist = "" + ds.history += hist + now.strftime("%Y%m%d:") + 'Data subset to only contain deployed (in water) data - the full record can be found in the corresponding FV00 file.' - \ No newline at end of file + ds.close() + ods.close() + + +if __name__ == "__main__": + select_in_water(sys.argv[1:]) \ No newline at end of file diff --git a/ocean_dp/qc/spike_test b/ocean_dp/qc/spike_test new file mode 100755 index 0000000..0bb13ea --- /dev/null +++ b/ocean_dp/qc/spike_test @@ -0,0 +1,8 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Wed Feb 12 16:28:38 2020 + +@author: tru050 +""" + From af33590364f1b1d6a98e90e75e43f2cae9b79378 Mon Sep 17 00:00:00 2001 From: bweeding <57697604+bweeding@users.noreply.github.com> Date: Fri, 14 Feb 2020 14:41:11 +1100 Subject: [PATCH 14/59] Create netcdf_gen.py --- ocean_dp/qc/netcdf_gen.py | 118 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 118 insertions(+) create mode 100755 ocean_dp/qc/netcdf_gen.py diff --git a/ocean_dp/qc/netcdf_gen.py b/ocean_dp/qc/netcdf_gen.py new file mode 100755 index 0000000..f687f83 --- /dev/null +++ b/ocean_dp/qc/netcdf_gen.py @@ -0,0 +1,118 @@ +# Copyright (C) 2020 Ben Weeding +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +from dateutil.parser import parse +from netCDF4 import Dataset, num2date, date2num +from datetime import datetime, timedelta +import sys +from datetime import datetime +import numpy as np +from dateutil import parser +import pytz +import os +import shutil + +# Provide the function with a filename (don't include .nc), a nominal depth, +# and pairs of names and arrays containing the data to be included as variables. +# A time dimension/variable is created by default, starting at 01/01/2020 using +# 1 hour timestamps + +# For example, netcdf_gen('test',30,'PRES',pres_data,'TEMP',temp_data) + +def netcdf_gen(file_name,nominal_depth,*args): + + # Convert the args tuple to a list + args = list(args) + + # Check the args are paired + if len(args) % 2 == 0: + + # Assign the names and data to lists + var_names = args[0::2] + + var_data = args[1::2] + + # Check if first of each pair is a string + if all(isinstance(x, str) for x in var_names): + + # Check if second of each pair are all equal in shape + if all(np.shape(var_data[1]) == np.shape(x) for x in var_data): + + # Create the netcdf with IMOS tag + ds = Dataset("IMOS_" + file_name + ".nc","w", format="NETCDF4") + + # Create time dimension with length to match data + time_dim = ds.createDimension("TIME", len(var_data[0])) + + time_var = ds.createVariable("TIME","f8",("TIME")) + + ds.variables['TIME'][:] = np.arange(25567,25567+(1/24)*len(var_data[1]),1/24) + + time_atts = ['long_name','time','units','days since 1950-01-01 00:00:00 UTC', + 'calendar','gregorian','axis','T','standard_name','time','valid_max', + 90000,'valid_min',0] + + for att_name,att_value in zip(time_atts[0::2],time_atts[1::2]): + + time_var.setncattr(att_name,att_value) + + # Create the nominal depth variable + nom_depth_var = ds.createVariable("NOMINAL_DEPTH","f8") + + ds.variables["NOMINAL_DEPTH"] = nominal_depth + + nom_dep_atts = ['long_name','nominal depth','units','m', + 'positive','down','axis','Z','standard_name','depth','valid_max', + 12000,'valid_min',-5,'reference_datum','sea surface'] + + for att_name,att_value in zip(nom_dep_atts[0::2],nom_dep_atts[1::2]): + + nom_depth_var.setncattr(att_name,att_value) + + # Create variables from input data + for name_in,data_in in zip(var_names,var_data): + + ds.createVariable(name_in,"f8",("TIME")) + + ds.variables[name_in][:] = data_in + + ds.close() + + else: + print('Data arrays not of equal length') + + + else: + print('Labels not in string format') + + else: + print('Data not passed in pairs') + + + + + + + + + + + + + + + + + From c61b8ba5ddd223ef3d32db43683a327e92a4b57a Mon Sep 17 00:00:00 2001 From: bweeding <57697604+bweeding@users.noreply.github.com> Date: Mon, 17 Feb 2020 16:19:55 +1100 Subject: [PATCH 15/59] Update select_in_water.py --- ocean_dp/qc/select_in_water.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/ocean_dp/qc/select_in_water.py b/ocean_dp/qc/select_in_water.py index 02ccb49..68f57be 100755 --- a/ocean_dp/qc/select_in_water.py +++ b/ocean_dp/qc/select_in_water.py @@ -27,6 +27,7 @@ import os import shutil +# Submit argument as a list def select_in_water(netCDFfiles): @@ -36,7 +37,7 @@ def select_in_water(netCDFfiles): for fn in netCDFfiles: # Check the file is an IMOS formatted file - if fn.split('_')[0]=='IMOS' + if fn.split('_')[0]=='IMOS': # Change the creation date in the filename to today now=datetime.utcnow() @@ -45,7 +46,7 @@ def select_in_water(netCDFfiles): fn_new_split = fn_new.split('_') - fn_new_split[-1] = "C-" + now.strftime("%Y%m%d") + fn_new_split[-1] = "C-" + now.strftime("%Y%m%d") + ".nc" fn_new = '_'.join(fn_new_split) @@ -100,6 +101,12 @@ def select_in_water(netCDFfiles): ds.date_created = now.strftime("%Y-%m-%dT%H:%M:%SZ") + # update the time coverage attributes + + ds.time_coverage_start = ods.time_deployment_start + + ds.time_coverage_end = ods.time_deployment_end + # update the history attribute try: hist = ds.history + "\n" From 04badbfce75d0df9f21055b11854ae33516d81cd Mon Sep 17 00:00:00 2001 From: Peter Jansen Date: Tue, 18 Feb 2020 09:31:04 +1100 Subject: [PATCH 16/59] Update select_in_water.py Remove some white space, able to operate on non IMOS file names, TODO: find TIME dimension (or the dimension of the TIME variable) --- ocean_dp/qc/select_in_water.py | 148 ++++++++++++++++----------------- 1 file changed, 72 insertions(+), 76 deletions(-) diff --git a/ocean_dp/qc/select_in_water.py b/ocean_dp/qc/select_in_water.py index 68f57be..5118ae8 100755 --- a/ocean_dp/qc/select_in_water.py +++ b/ocean_dp/qc/select_in_water.py @@ -32,91 +32,87 @@ def select_in_water(netCDFfiles): new_name = [] # list of new file names + now = datetime.utcnow() # loop over all file names given for fn in netCDFfiles: # Check the file is an IMOS formatted file - if fn.split('_')[0]=='IMOS': - + if fn.split('_')[0]=='IMOS': + fn_new = fn_new.replace("FV00", "FV01") + fn_new_split = fn_new.split('_') # Change the creation date in the filename to today - now=datetime.utcnow() - - fn_new = fn.replace("FV00", "FV01") - - fn_new_split = fn_new.split('_') - - fn_new_split[-1] = "C-" + now.strftime("%Y%m%d") + ".nc" - + fn_new_split[-1] = "C-" + now.strftime("%Y%m%d") + ".nc" fn_new = '_'.join(fn_new_split) + else: + fn_new = fn.replace(".nc", "-trim.nc") - # Add the new file name to the list of new file names - new_name.append(fn_new) - - # Load the original netcdf file - ods = Dataset(fn,'a') - - # Extract the time dimension, and the deployment start and end - time = np.array(ods.variables['TIME'][:]) - - inw = parse(ods.time_deployment_start) - outw = parse(ods.time_deployment_end) - - # Convert the start and end to the number format used in TIME - inw_num = date2num(inw.replace(tzinfo=None),units = ods.variables['TIME'].units) - outw_num = date2num(outw.replace(tzinfo=None),units = ods.variables['TIME'].units) - - # Create logical index of deployed times - deployed = np.logical_and(time>=inw_num,time<=outw_num) - - # Determine the length of the new time dimension - time_dim = len(time[deployed]) - - # Create the new netcdf file - ds = Dataset(fn_new, "w", format="NETCDF4") - - TIME = ds.createDimension("TIME",time_dim) - - # Copy global attributes - for att in ods.ncattrs(): - - ds.setncattr(att,ods.getncattr(att)) - - # Copy variables - for v_name, varin in ods.variables.items(): - - varout = ds.createVariable(v_name, varin.datatype, varin.dimensions) - - # Copy variable attributes - varout.setncatts({k: varin.getncattr(k) for k in varin.ncattrs()}) + # Add the new file name to the list of new file names + new_name.append(fn_new) + + # Load the original netcdf file + ods = Dataset(fn,'a') + + # Extract the time dimension, and the deployment start and end + # TODO: check this works + v = nc.get_variables_by_attributes(standard_name='time') + time = np.array(v[0][:]) + + inw = parse(ods.time_deployment_start) + outw = parse(ods.time_deployment_end) + + # Convert the start and end to the number format used in TIME + inw_num = date2num(inw.replace(tzinfo=None), units=ods.variables['TIME'].units) + outw_num = date2num(outw.replace(tzinfo=None), units=ods.variables['TIME'].units) + + # Create logical index of deployed times + deployed = np.logical_and(time>=inw_num, time<=outw_num) + + # Determine the length of the new time dimension + time_dim_len = len(time[deployed]) + + # Create the new netcdf file + ds = Dataset(fn_new, "w", format="NETCDF4") + + new_time_dim = ds.createDimension("TIME", time_dim_len) + + # Copy global attributes + for att in ods.ncattrs(): + ds.setncattr(att, ods.getncattr(att)) + + # Copy variables + for v_name, varin in ods.variables.items(): + + varout = ds.createVariable(v_name, varin.datatype, varin.dimensions) + + # Copy variable attributes + varout.setncatts({k: varin.getncattr(k) for k in varin.ncattrs()}) + + # Fill variables with deployed data + # TODO: should check if the dimensions for the variable include TIME, and truncate that dimension + if np.array(varin[:]).size == 1: + varout[:] = varin[:] + else: + varout[:] = np.array(varin[:])[deployed] + + ds.date_created = now.strftime("%Y-%m-%dT%H:%M:%SZ") + + # update the time coverage attributes + ds.time_coverage_start = ods.time_deployment_start + ds.time_coverage_end = ods.time_deployment_end + + # update the history attribute + try: + hist = ds.history + "\n" + except AttributeError: + hist = "" + ds.history += hist + now.strftime("%Y%m%d:") + 'Data subset to only contain deployed (in water) data - the full record can be found in the corresponding FV00 file.' + + ds.close() + ods.close() - # Fill variables with deployed data - if np.array(varin[:]).size == 1: - - varout[:] = varin[:] - - else: - - varout[:] = np.array(varin[:])[deployed] - - ds.date_created = now.strftime("%Y-%m-%dT%H:%M:%SZ") - - # update the time coverage attributes - - ds.time_coverage_start = ods.time_deployment_start - - ds.time_coverage_end = ods.time_deployment_end - - # update the history attribute - try: - hist = ds.history + "\n" - except AttributeError: - hist = "" - ds.history += hist + now.strftime("%Y%m%d:") + 'Data subset to only contain deployed (in water) data - the full record can be found in the corresponding FV00 file.' - - ds.close() - ods.close() + return new_name if __name__ == "__main__": - select_in_water(sys.argv[1:]) \ No newline at end of file + select_in_water(sys.argv[1:]) From e5559ca5e31d0dd90edcd2b1748c6523eac34976 Mon Sep 17 00:00:00 2001 From: bweeding <57697604+bweeding@users.noreply.github.com> Date: Wed, 19 Feb 2020 11:25:52 +1100 Subject: [PATCH 17/59] Update pressure_interpolator.py --- ocean_dp/processing/pressure_interpolator.py | 184 ++++++++++++------- 1 file changed, 122 insertions(+), 62 deletions(-) diff --git a/ocean_dp/processing/pressure_interpolator.py b/ocean_dp/processing/pressure_interpolator.py index cb929af..ed764c3 100755 --- a/ocean_dp/processing/pressure_interpolator.py +++ b/ocean_dp/processing/pressure_interpolator.py @@ -22,26 +22,54 @@ import os import shutil -def pressure_interpolator: - - # Load the filenames of the FV00 files in the current folder - fv00_files = glob.glob('*FV00*.nc'); +# Supply netCDFfiles as a ['list'] of files, agg as a 'string' - # Extract the aggregate file data - agg = Dataset(glob.glob('*Aggregate*.nc')[0], mode="r") +def pressure_interpolator(netCDFfiles = None,agg = None): + + if netCDFfiles==None: - # Loop through each of the FV00 files - for i in fv00_files: + # Load the filenames of the fv01 files in the current folder + netCDFfiles = glob.glob('*FV01*.nc') + + if agg ==None: - # Extract the contents of the current file - fv00_contents = Dataset(i, mode="r") + # Extract the aggregate file data + agg = Dataset(glob.glob('*Aggregate*.nc')[0], mode="r") + + else: + + agg = Dataset(glob.glob(agg)[0], mode="r") + + # Loop through each of the fv01 files + for fn in netCDFfiles: + + # Change the creation date in the filename to today + now=datetime.utcnow() + + fn_new_split = fn.split('_') + + fn_new_split[-1] = "C-" + now.strftime("%Y%m%d") + ".nc" + + fn_new_split[2] += 'IP' + + fn_new = '_'.join(fn_new_split) + + + # If a new (different) filename has been successfully generated, make + # a copy of the old file with the new name + if fn_new != fn: + # copy file + shutil.copy(fn, fn_new) + + # Open and work in the new copy + fv01_contents = Dataset(fn_new,mode='a') # Check the current file doesn't contain pressure to run the following # interpolator - if not 'PRES' in fv00_contents.variables: + if not 'PRES' in fv01_contents.variables: # Create a NaN array to fill with pressure values - interp_agg_pres = np.full((len(agg.variables["NOMINAL_DEPTH"])+1,len(fv00_contents.variables["TIME"])),np.nan) + interp_agg_pres = np.full((len(agg.variables["NOMINAL_DEPTH"])+1,len(fv01_contents.variables["TIME"])),np.nan) # Set the first row as zeros to set 0m as 0dbar interp_agg_pres[0,:] = 0 @@ -50,14 +78,14 @@ def pressure_interpolator: # including the 0m values agg_nominal_depths = np.insert(np.array(agg.variables["NOMINAL_DEPTH"][:]),0,0) - # For each nominal depth, interpolate the agg data at the FV00 times + # For each nominal depth, interpolate the agg data at the fv01 times for j in range(1,len(agg_nominal_depths)): time_selection = agg.variables["TIME"][agg.variables["instrument_index"][:]==(j-1)] pres_selection = agg.variables["PRES"][agg.variables["instrument_index"][:]==(j-1)] - interp_agg_pres[j,:] = np.interp(fv00_contents.variables["TIME"][:],time_selection,pres_selection) + interp_agg_pres[j,:] = np.interp(fv01_contents.variables["TIME"][:],time_selection,pres_selection) # Sort the nominal depths and pressures according to nominal depth interp_agg_pres = interp_agg_pres[np.argsort(agg_nominal_depths),:] @@ -71,7 +99,7 @@ def pressure_interpolator: interp_agg_pres_df = pd.DataFrame(data=interp_agg_pres,index=agg_nominal_depths) # Find all the columns where the lowest element is NaN - nan_cols = interp_agg_pres_df[interp_agg_pres_df[-1:].isna()].tolist() + nan_cols = np.where(interp_agg_pres_df.iloc[-1].isna()) # Select each column containing an NaN as the deepest value for j in nan_cols: @@ -94,85 +122,117 @@ def pressure_interpolator: # Convert the DataFrame back to an array interp_agg_pres = interp_agg_pres_df.to_numpy() - # Create a NaN array to receive the FV00 interpolated pressures - interp_fv00_pres = np.full((np.shape(fv00_contents.variables["TIME"][:])),np.nan) + # Create a NaN array to receive the fv01 interpolated pressures + interp_fv01_pres = np.full((np.shape(fv01_contents.variables["TIME"][:])),np.nan) - # At each timestamp, interpolate pressure for the FV00 data - for j in range(len(fv00_contents.variables["TIME"])): + # At each timestamp, interpolate pressure for the fv01 data + for j in range(len(fv01_contents.variables["TIME"])): - interp_fv00_pres[j] = np.interp(fv00_contents.variables["NOMINAL_DEPTH"][0],agg_nominal_depths,interp_agg_pres[:,j]) + interp_fv01_pres[j] = np.interp(fv01_contents.variables["NOMINAL_DEPTH"][0],agg_nominal_depths,interp_agg_pres[:,j]) - # Use methods from add_qc_flags to make a new netcdf? + # Create the PRES and PRES_quality_control variables, and their attributes + + pres_var = fv01_contents.createVariable('PRES','f8',fv01_contents.variables['TIME'].dimensions,fill_value=99, zlib=True) + + pres_atts = ['long_name','sea_water_pressure_due_to_sea_water','units','dbar', + 'standard_name','coordinates','TIME LATITUDE LONGITUDE NOMINAL_DEPTH','sea_water_pressure_due_to_sea_water','valid_max', + 12000,'valid_min',-15] + + for att_name,att_value in zip(pres_atts[0::2],pres_atts[1::2]): + + pres_var.setncattr(att_name,att_value) + + pres_var[:] = interp_fv01_pres + + + pres_qc_var = fv01_contents.createVariable('PRES_quality_control','i1',fv01_contents.variables['TIME'].dimensions,fill_value=99, zlib=True) + + pres_qc_var.long_name = "quality_code for PRES" + + pres_qc_var.flag_values = np.array([0, 1, 2, 3, 4, 6, 7, 9]) + + pres_qc_var.flag_meanings = 'unknown good_data probably_good_data probably_bad_data bad_data not_deployed interpolated missing_value' + + pres_qc_var[:] = 7 + + pres_var.ancillary_variables = "PRES_quality_control" + + # Close the netcdf files + + fv01_contents.close() + + agg.close() - # Deal with files that already contain pressure, but may contain NaNs - else: - # Create a NaN array to fill with pressure values - interp_agg_pres = np.full((len(agg.variables["NOMINAL_DEPTH"])+1,len(fv00_contents.variables["TIME"])),np.nan) + # Deal with files that already contain pressure, but contain NaNs + elif any(np.isnan(agg.variables['PRES'][:])): + + # Create a NaN array to fill with pressure values + interp_agg_pres = np.full((len(agg.variables["NOMINAL_DEPTH"])+1,len(fv01_contents.variables["TIME"])),np.nan) # Set the first row as zeros to set 0m as 0dbar interp_agg_pres[0,:] = 0 - # Set the last row to 5000 to set 5000m as 5000dbar (~seafloor), - # only for interpolation in cases where the deepest sensor has failed - #interp_agg_pres[-1,:] = 5000 - # Create a new array representing the nominal depths of the agg file, # including the 0m values agg_nominal_depths = np.insert(np.array(agg.variables["NOMINAL_DEPTH"][:]),0,0) - # For each nominal depth, interpolate the agg data at the FV00 times + # For each nominal depth, interpolate the agg data at the fv01 times for j in range(1,len(agg_nominal_depths)): time_selection = agg.variables["TIME"][agg.variables["instrument_index"][:]==(j-1)] pres_selection = agg.variables["PRES"][agg.variables["instrument_index"][:]==(j-1)] - interp_agg_pres[j,:] = np.interp(fv00_contents.variables["TIME"][:],time_selection,pres_selection) + interp_agg_pres[j,:] = np.interp(fv01_contents.variables["TIME"][:],time_selection,pres_selection) # Sort the nominal depths and pressures according to nominal depth interp_agg_pres = interp_agg_pres[np.argsort(agg_nominal_depths),:] agg_nominal_depths.sort() + + # Make a dataframe of the interpolated pressure to handle NaNs easily + interp_agg_pres_df = pd.DataFrame(data=interp_agg_pres,index=agg_nominal_depths) - # If there are any NaN values, linearly interpolate profilewise - if np.isnan(np.sum(interp_agg_pres)): + # Find all the columns where the lowest element is NaN + nan_cols = np.where(interp_agg_pres_df.iloc[-1].isna()) + + # Select each column containing an NaN as the deepest value + for j in nan_cols: - # Make a dataframe of the interpolated pressure to handle NaNs easily - interp_agg_pres_df = pd.DataFrame(data=interp_agg_pres,index=agg_nominal_depths) + # Find the shallowest nominal depth that isn't NaN + shallowest_val = pd.Series.last_valid_index(interp_agg_pres_df.iloc[:,j]) - # Find all the columns where the lowest element is NaN - nan_cols = interp_agg_pres_df[interp_agg_pres_df[-1:].isna()].tolist() + # Find the index of that nominal depth + shallowest_idx = interp_agg_pres_df.index.tolist().index(shallowest_val) - # Select each column containing an NaN as the deepest value - for j in nan_cols: + # Starting at the shallowest NaN in a continous block of NaNs to the bottom + for k in range(shallowest_idx+1,len(interp_agg_pres_df)): - # Find the shallowest nominal depth that isn't NaN - shallowest_val = pd.Series.last_valid_index(interp_agg_pres_df.iloc[:,j]) - - # Find the index of that nominal depth - shallowest_idx = interp_agg_pres_df.index.tolist().index(shallowest_val) + # Linearly interpolate from shallow to deep, based on a nominal depth difference of 1m equating to 1dbar + interp_agg_pres_df.iloc[k,j] = interp_agg_pres_df.iloc[k-1,j]+np.diff(interp_agg_pres_df.index)[k-1] - # Starting at the shallowest NaN in a continous block of NaNs to the bottom - for k in range(shallowest_idx+1,len(interp_agg_pres_df)): - - # Linearly interpolate from shallow to deep, based on a nominal depth difference of 1m equating to 1dbar - interp_agg_pres_df.iloc[k,j] = interp_agg_pres_df.iloc[k-1,j]+np.diff(interp_agg_pres_df.index)[k-1] - - # Linearly interpolate any remaining NaNs - interp_agg_pres_df = interp_agg_pres_df.interpolate(method="index") - - # Convert the DataFrame back to an array - interp_agg_pres = interp_agg_pres_df.to_numpy() - - # Create a NaN array to receive the FV00 interpolated pressures - interp_fv00_pres = np.full((np.shape(fv00_contents.variables["TIME"][:])),np.nan) + # Linearly interpolate any remaining NaNs + interp_agg_pres_df = interp_agg_pres_df.interpolate(method="index") + # Convert the DataFrame back to an array + interp_agg_pres = interp_agg_pres_df.to_numpy() + + # Create a NaN array to receive the fv01 interpolated pressures + interp_fv01_pres = np.full((np.shape(fv01_contents.variables["TIME"][:])),np.nan) + # Extract the interpolated pressures (NaNs removed) to store in netCDF4 - interp_fv00_pres = interp_agg_pres_df[interp_agg_pres_df.index==fv00_contents.variables["NOMINAL_DEPTH"][:]] + interp_fv01_pres = interp_agg_pres_df[interp_agg_pres_df.index==fv01_contents.variables["NOMINAL_DEPTH"][:]] + + # Find indices where the netcdf data and interpolated data don't match (where the NaNs are in the netcdf) + nan_rep_idx = np.where(interp_fv01_pres!=fv01_contents.variables['PRES'][:])[1] + + fv01_contents.variables['PRES_quality_control'][nan_rep_idx] = 7 + + # Insert pressure value with NaNs interpolated back into netcdf + fv01_contents.variables['PRES'][:] = interp_fv01_pres + + fv01_contents.close() - # - - From 500830bb2173269f0e6bc5d27d39317f3d4584a9 Mon Sep 17 00:00:00 2001 From: bweeding <57697604+bweeding@users.noreply.github.com> Date: Wed, 19 Feb 2020 15:40:05 +1100 Subject: [PATCH 18/59] Update pressure_interpolator.py --- ocean_dp/processing/pressure_interpolator.py | 34 ++++++++++++++++---- 1 file changed, 28 insertions(+), 6 deletions(-) diff --git a/ocean_dp/processing/pressure_interpolator.py b/ocean_dp/processing/pressure_interpolator.py index ed764c3..af248c2 100755 --- a/ocean_dp/processing/pressure_interpolator.py +++ b/ocean_dp/processing/pressure_interpolator.py @@ -24,14 +24,18 @@ # Supply netCDFfiles as a ['list'] of files, agg as a 'string' -def pressure_interpolator(netCDFfiles = None,agg = None): +def pressure_interpolator(netCDFfiles = [],agg = []): - if netCDFfiles==None: + if netCDFfiles==[]: + + print('netcdffiles = none') # Load the filenames of the fv01 files in the current folder netCDFfiles = glob.glob('*FV01*.nc') - if agg ==None: + if agg == []: + + print('agg = none') # Extract the aggregate file data agg = Dataset(glob.glob('*Aggregate*.nc')[0], mode="r") @@ -43,6 +47,8 @@ def pressure_interpolator(netCDFfiles = None,agg = None): # Loop through each of the fv01 files for fn in netCDFfiles: + print('File selected is '+fn) + # Change the creation date in the filename to today now=datetime.utcnow() @@ -58,16 +64,25 @@ def pressure_interpolator(netCDFfiles = None,agg = None): # If a new (different) filename has been successfully generated, make # a copy of the old file with the new name if fn_new != fn: + + print('copying file') # copy file shutil.copy(fn, fn_new) # Open and work in the new copy fv01_contents = Dataset(fn_new,mode='a') + print('copied file opened') + # Check the current file doesn't contain pressure to run the following # interpolator if not 'PRES' in fv01_contents.variables: + print("file doesn't contain pressure") + + print(fv01_contents.variables.keys()) + print(agg.variables.keys()) + # Create a NaN array to fill with pressure values interp_agg_pres = np.full((len(agg.variables["NOMINAL_DEPTH"])+1,len(fv01_contents.variables["TIME"])),np.nan) @@ -160,11 +175,11 @@ def pressure_interpolator(netCDFfiles = None,agg = None): # Close the netcdf files fv01_contents.close() - - agg.close() # Deal with files that already contain pressure, but contain NaNs - elif any(np.isnan(agg.variables['PRES'][:])): + elif any(np.isnan(np.array(fv01_contents.variables['PRES'][:]))): + + print("file contains pressure and agg contains NaNs") # Create a NaN array to fill with pressure values interp_agg_pres = np.full((len(agg.variables["NOMINAL_DEPTH"])+1,len(fv01_contents.variables["TIME"])),np.nan) @@ -226,13 +241,20 @@ def pressure_interpolator(netCDFfiles = None,agg = None): # Find indices where the netcdf data and interpolated data don't match (where the NaNs are in the netcdf) nan_rep_idx = np.where(interp_fv01_pres!=fv01_contents.variables['PRES'][:])[1] + # fv01_contents.variables['PRES_quality_control'][nan_rep_idx] = 7 + print('QC altered in original press') + # Insert pressure value with NaNs interpolated back into netcdf fv01_contents.variables['PRES'][:] = interp_fv01_pres + print('press altered in orginal press') + fv01_contents.close() + agg.close() + From 959426b9c483ac157c42f4eaf70c61ad437f30d1 Mon Sep 17 00:00:00 2001 From: bweeding <57697604+bweeding@users.noreply.github.com> Date: Wed, 19 Feb 2020 16:13:13 +1100 Subject: [PATCH 19/59] Update select_in_water.py --- ocean_dp/qc/select_in_water.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ocean_dp/qc/select_in_water.py b/ocean_dp/qc/select_in_water.py index 5118ae8..16cf613 100755 --- a/ocean_dp/qc/select_in_water.py +++ b/ocean_dp/qc/select_in_water.py @@ -39,7 +39,7 @@ def select_in_water(netCDFfiles): # Check the file is an IMOS formatted file if fn.split('_')[0]=='IMOS': - fn_new = fn_new.replace("FV00", "FV01") + fn_new = fn.replace("FV00", "FV01") fn_new_split = fn_new.split('_') # Change the creation date in the filename to today fn_new_split[-1] = "C-" + now.strftime("%Y%m%d") + ".nc" @@ -55,7 +55,7 @@ def select_in_water(netCDFfiles): # Extract the time dimension, and the deployment start and end # TODO: check this works - v = nc.get_variables_by_attributes(standard_name='time') + v = ods.get_variables_by_attributes(standard_name='time') time = np.array(v[0][:]) inw = parse(ods.time_deployment_start) From d8cbcd493728d642d3ba4e07b2bd003a703791e0 Mon Sep 17 00:00:00 2001 From: bweeding <57697604+bweeding@users.noreply.github.com> Date: Thu, 20 Feb 2020 08:22:39 +1100 Subject: [PATCH 20/59] Update copyDataset.py --- ocean_dp/aggregation/copyDataset.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ocean_dp/aggregation/copyDataset.py b/ocean_dp/aggregation/copyDataset.py index d23fe7d..95c5998 100644 --- a/ocean_dp/aggregation/copyDataset.py +++ b/ocean_dp/aggregation/copyDataset.py @@ -21,7 +21,7 @@ # http://thredds.aodn.org.au/thredds/catalog/IMOS/ABOS/DA/EAC2000/catalog.html from dateutil.parser import parse - + def aggregate(files, varNames): # split this into createCatalog - copy needed information into structure @@ -244,7 +244,7 @@ def aggregate(files, varNames): filen = 0 # variables we want regardless - varNames = [varNames]+['LATITUDE', 'LONGITUDE', 'NOMINAL_DEPTH'] + varNames.extend(['LATITUDE', 'LONGITUDE', 'NOMINAL_DEPTH']) # remove any duplicates varNamesOut = set(varNames) From 20fe907d30a01acbb916ddb8190e70e9778916b3 Mon Sep 17 00:00:00 2001 From: bweeding <57697604+bweeding@users.noreply.github.com> Date: Thu, 20 Feb 2020 08:48:17 +1100 Subject: [PATCH 21/59] Update netcdf_gen.py --- ocean_dp/qc/netcdf_gen.py | 229 +++++++++++++++++++------------------- 1 file changed, 112 insertions(+), 117 deletions(-) diff --git a/ocean_dp/qc/netcdf_gen.py b/ocean_dp/qc/netcdf_gen.py index f687f83..d20f998 100755 --- a/ocean_dp/qc/netcdf_gen.py +++ b/ocean_dp/qc/netcdf_gen.py @@ -1,118 +1,113 @@ -# Copyright (C) 2020 Ben Weeding -# -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program. If not, see . - -from dateutil.parser import parse -from netCDF4 import Dataset, num2date, date2num -from datetime import datetime, timedelta -import sys -from datetime import datetime -import numpy as np -from dateutil import parser -import pytz -import os -import shutil - -# Provide the function with a filename (don't include .nc), a nominal depth, -# and pairs of names and arrays containing the data to be included as variables. -# A time dimension/variable is created by default, starting at 01/01/2020 using -# 1 hour timestamps - -# For example, netcdf_gen('test',30,'PRES',pres_data,'TEMP',temp_data) - -def netcdf_gen(file_name,nominal_depth,*args): - - # Convert the args tuple to a list - args = list(args) - - # Check the args are paired - if len(args) % 2 == 0: - - # Assign the names and data to lists - var_names = args[0::2] - - var_data = args[1::2] - - # Check if first of each pair is a string - if all(isinstance(x, str) for x in var_names): - - # Check if second of each pair are all equal in shape - if all(np.shape(var_data[1]) == np.shape(x) for x in var_data): - - # Create the netcdf with IMOS tag - ds = Dataset("IMOS_" + file_name + ".nc","w", format="NETCDF4") - - # Create time dimension with length to match data - time_dim = ds.createDimension("TIME", len(var_data[0])) - - time_var = ds.createVariable("TIME","f8",("TIME")) - - ds.variables['TIME'][:] = np.arange(25567,25567+(1/24)*len(var_data[1]),1/24) - - time_atts = ['long_name','time','units','days since 1950-01-01 00:00:00 UTC', - 'calendar','gregorian','axis','T','standard_name','time','valid_max', - 90000,'valid_min',0] - - for att_name,att_value in zip(time_atts[0::2],time_atts[1::2]): - - time_var.setncattr(att_name,att_value) - - # Create the nominal depth variable - nom_depth_var = ds.createVariable("NOMINAL_DEPTH","f8") - - ds.variables["NOMINAL_DEPTH"] = nominal_depth - - nom_dep_atts = ['long_name','nominal depth','units','m', - 'positive','down','axis','Z','standard_name','depth','valid_max', - 12000,'valid_min',-5,'reference_datum','sea surface'] - - for att_name,att_value in zip(nom_dep_atts[0::2],nom_dep_atts[1::2]): - - nom_depth_var.setncattr(att_name,att_value) - - # Create variables from input data - for name_in,data_in in zip(var_names,var_data): - - ds.createVariable(name_in,"f8",("TIME")) - - ds.variables[name_in][:] = data_in - - ds.close() - - else: - print('Data arrays not of equal length') - - - else: - print('Labels not in string format') - - else: - print('Data not passed in pairs') - - - - - - - - - - - - - - - - + # Copyright (C) 2020 Ben Weeding + # + # This program is free software: you can redistribute it and/or modify + # it under the terms of the GNU General Public License as published by + # the Free Software Foundation, either version 3 of the License, or + # (at your option) any later version. + # + # This program is distributed in the hope that it will be useful, + # but WITHOUT ANY WARRANTY; without even the implied warranty of + # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + # GNU General Public License for more details. + # + # You should have received a copy of the GNU General Public License + # along with this program. If not, see . + from netCDF4 import Dataset, date2num + import sys + from datetime import datetime + import numpy as np + + # Provide the function with a filename (don't include .nc), a nominal depth, + # and pairs of names and arrays containing the data to be included as variables. + # A time dimension/variable is created by default, starting at 01/01/2020 using + # 1 hour timestamps + + # For example, netcdf_gen('test',30,'PRES',pres_data,'TEMP',temp_data) + # from the command line gen_test_data.py test 30 PRES 10,20,30 TEMP 11,12,NaN + + def netcdf_gen(file_name, nominal_depth, *args): + # Convert the args tuple to a list + args = list(args) + #print(args, type(args[1])) + + file_name = "IMOS_" + file_name + ".nc" # if we insist on not wanting to pass these + + # deal with passing nominal depth as a string + if isinstance(nominal_depth, str): + nominal_depth = float(nominal_depth) + print('nominal depth :', nominal_depth) + + # Check the args are paired + if len(args) % 2 == 0: + + # Assign the names and data to lists + var_names = args[0::2] + + # deal with passing data as a string list of values + if isinstance(args[1], str): + var_data = [[float(b) for b in a.split(',')] for a in args[1::2]] + #print('var_data split', var_data) + else: + var_data = args[1::2] + + # Check if first of each pair is a string + if all(isinstance(x, str) for x in var_names): + + # Check if second of each pair are all equal in shape + if all(np.shape(var_data[1]) == np.shape(x) for x in var_data): + + # Create the netcdf with IMOS tag + ds = Dataset(file_name, "w", format="NETCDF4") + + # Create time dimension with length to match data + time_dim = ds.createDimension("TIME", len(var_data[0])) + + time_var = ds.createVariable("TIME", "f8", ("TIME")) + + time_var.setncattr('long_name', 'time') + time_var.setncattr('standard_name', 'time') + time_var.setncattr('units', 'days since 1950-01-01 00:00:00 UTC') + time_var.setncattr('calendar', 'gregorian') + time_var.setncattr('axis', 'T') + time_var.setncattr('valid_max', 90000) + time_var.setncattr('valid_min', 0) + + t0 = date2num(datetime(2020, 1, 1), units=time_var.units) + ds.variables['TIME'][:] = np.arange(t0, t0 + (1 / 24) * len(var_data[1]), 1 / 24) + + # Create the nominal depth variable + nom_depth_var = ds.createVariable("NOMINAL_DEPTH", "f8") + nom_depth_var.setncattr('long_name', 'nominal depth') + nom_depth_var.setncattr('units', 'dbar') + nom_depth_var.setncattr('positive', 'down') + nom_depth_var.setncattr('axis', 'Z') + nom_depth_var.setncattr('valid_max', 12000) + nom_depth_var.setncattr('valid_min', -5) + nom_depth_var.setncattr('reference_datum', 'sea surface') + + ds.variables["NOMINAL_DEPTH"][:] = nominal_depth + + # Create variables from input data + for name_in, data_in in zip(var_names, var_data): + ds.createVariable(name_in, "f8", ("TIME")) + ds.variables[name_in][:] = data_in + + ds.close() + print("generated ", file_name) + + return (file_name) + + else: + print('Data arrays not of equal length') + + + else: + print('Labels not in string format') + + else: + print('Data not passed in pairs') + + + if __name__ == "__main__": + netcdf_gen(sys.argv[1], sys.argv[2], *sys.argv[3:]) \ No newline at end of file From cd6fc8ad08473ebb44a58db05a000f845222521f Mon Sep 17 00:00:00 2001 From: bweeding <57697604+bweeding@users.noreply.github.com> Date: Thu, 20 Feb 2020 14:25:14 +1100 Subject: [PATCH 22/59] Update netcdf_gen.py --- ocean_dp/qc/netcdf_gen.py | 155 ++++++++++++++++++++------------------ 1 file changed, 81 insertions(+), 74 deletions(-) diff --git a/ocean_dp/qc/netcdf_gen.py b/ocean_dp/qc/netcdf_gen.py index d20f998..7eae159 100755 --- a/ocean_dp/qc/netcdf_gen.py +++ b/ocean_dp/qc/netcdf_gen.py @@ -13,101 +13,108 @@ # You should have received a copy of the GNU General Public License # along with this program. If not, see . - from netCDF4 import Dataset, date2num - import sys - from datetime import datetime - import numpy as np +from netCDF4 import Dataset, date2num +import sys +from datetime import datetime +import numpy as np - # Provide the function with a filename (don't include .nc), a nominal depth, - # and pairs of names and arrays containing the data to be included as variables. - # A time dimension/variable is created by default, starting at 01/01/2020 using - # 1 hour timestamps + # Provide the function with a filename (don't include .nc), a nominal depth, +# and pairs of names and arrays containing the data to be included as variables. +# A time dimension/variable is created by default, starting at 01/01/2020 using +# 1 hour timestamps - # For example, netcdf_gen('test',30,'PRES',pres_data,'TEMP',temp_data) - # from the command line gen_test_data.py test 30 PRES 10,20,30 TEMP 11,12,NaN + # For example, netcdf_gen('test',30,'PRES',pres_data,'TEMP',temp_data) +# from the command line gen_test_data.py test 30 PRES 10,20,30 TEMP 11,12,NaN - def netcdf_gen(file_name, nominal_depth, *args): - # Convert the args tuple to a list - args = list(args) - #print(args, type(args[1])) +def netcdf_gen(file_name, nominal_depth, *args): + # Convert the args tuple to a list + args = list(args) + #print(args, type(args[1])) - file_name = "IMOS_" + file_name + ".nc" # if we insist on not wanting to pass these + file_name = "IMOS_" + file_name + ".nc" # if we insist on not wanting to pass these - # deal with passing nominal depth as a string - if isinstance(nominal_depth, str): - nominal_depth = float(nominal_depth) - print('nominal depth :', nominal_depth) + # deal with passing nominal depth as a string + if isinstance(nominal_depth, str): + nominal_depth = float(nominal_depth) + print('nominal depth :', nominal_depth) - # Check the args are paired - if len(args) % 2 == 0: + # Check the args are paired + if len(args) % 2 == 0: - # Assign the names and data to lists - var_names = args[0::2] + # Assign the names and data to lists + var_names = args[0::2] - # deal with passing data as a string list of values - if isinstance(args[1], str): - var_data = [[float(b) for b in a.split(',')] for a in args[1::2]] - #print('var_data split', var_data) - else: - var_data = args[1::2] + # deal with passing data as a string list of values + if isinstance(args[1], str): + var_data = [[float(b) for b in a.split(',')] for a in args[1::2]] + #print('var_data split', var_data) + else: + var_data = args[1::2] - # Check if first of each pair is a string - if all(isinstance(x, str) for x in var_names): + # Check if first of each pair is a string + if all(isinstance(x, str) for x in var_names): - # Check if second of each pair are all equal in shape - if all(np.shape(var_data[1]) == np.shape(x) for x in var_data): + # Check if second of each pair are all equal in shape + if all(np.shape(var_data[0]) == np.shape(x) for x in var_data): - # Create the netcdf with IMOS tag - ds = Dataset(file_name, "w", format="NETCDF4") + # Create the netcdf with IMOS tag + ds = Dataset(file_name, "w", format="NETCDF4") - # Create time dimension with length to match data - time_dim = ds.createDimension("TIME", len(var_data[0])) + # Create time dimension with length to match data + time_dim = ds.createDimension("TIME", len(var_data[0])) - time_var = ds.createVariable("TIME", "f8", ("TIME")) + time_var = ds.createVariable("TIME", "f8", ("TIME")) - time_var.setncattr('long_name', 'time') - time_var.setncattr('standard_name', 'time') - time_var.setncattr('units', 'days since 1950-01-01 00:00:00 UTC') - time_var.setncattr('calendar', 'gregorian') - time_var.setncattr('axis', 'T') - time_var.setncattr('valid_max', 90000) - time_var.setncattr('valid_min', 0) + time_var.setncattr('long_name', 'time') + time_var.setncattr('standard_name', 'time') + time_var.setncattr('units', 'days since 1950-01-01 00:00:00 UTC') + time_var.setncattr('calendar', 'gregorian') + time_var.setncattr('axis', 'T') + time_var.setncattr('valid_max', 90000) + time_var.setncattr('valid_min', 0) - t0 = date2num(datetime(2020, 1, 1), units=time_var.units) - ds.variables['TIME'][:] = np.arange(t0, t0 + (1 / 24) * len(var_data[1]), 1 / 24) + t0 = date2num(datetime(2020, 1, 1), units=time_var.units) + ds.variables['TIME'][:] = np.arange(t0, t0 + (1 / 24) * len(var_data[1]), 1 / 24) - # Create the nominal depth variable - nom_depth_var = ds.createVariable("NOMINAL_DEPTH", "f8") - nom_depth_var.setncattr('long_name', 'nominal depth') - nom_depth_var.setncattr('units', 'dbar') - nom_depth_var.setncattr('positive', 'down') - nom_depth_var.setncattr('axis', 'Z') - nom_depth_var.setncattr('valid_max', 12000) - nom_depth_var.setncattr('valid_min', -5) - nom_depth_var.setncattr('reference_datum', 'sea surface') + # Create the nominal depth variable + nom_depth_var = ds.createVariable("NOMINAL_DEPTH", "f8") + nom_depth_var.setncattr('long_name', 'nominal depth') + nom_depth_var.setncattr('units', 'dbar') + nom_depth_var.setncattr('positive', 'down') + nom_depth_var.setncattr('axis', 'Z') + nom_depth_var.setncattr('valid_max', 12000) + nom_depth_var.setncattr('valid_min', -5) + nom_depth_var.setncattr('reference_datum', 'sea surface') - ds.variables["NOMINAL_DEPTH"][:] = nominal_depth + ds.variables["NOMINAL_DEPTH"][:] = nominal_depth - # Create variables from input data - for name_in, data_in in zip(var_names, var_data): - ds.createVariable(name_in, "f8", ("TIME")) - ds.variables[name_in][:] = data_in + # Create variables from input data + for name_in, data_in in zip(var_names, var_data): + ds.createVariable(name_in, "f8", ("TIME")) + ds.variables[name_in][:] = data_in - ds.close() - print("generated ", file_name) + ds.close() + print("generated ", file_name) - return (file_name) + return (file_name) - else: - print('Data arrays not of equal length') + else: + print('Data arrays not of equal length') - - else: - print('Labels not in string format') - else: - print('Data not passed in pairs') + else: + print('Labels not in string format') - - if __name__ == "__main__": - netcdf_gen(sys.argv[1], sys.argv[2], *sys.argv[3:]) \ No newline at end of file + else: + print('Data not passed in pairs') + + +if __name__ == "__main__": + netcdf_gen(sys.argv[1], sys.argv[2], *sys.argv[3:]) + + + + + + + \ No newline at end of file From b6d6168086b3a6d71c7c944d41f5c461d0af3d5a Mon Sep 17 00:00:00 2001 From: bweeding <57697604+bweeding@users.noreply.github.com> Date: Thu, 20 Feb 2020 15:07:14 +1100 Subject: [PATCH 23/59] Update netcdf_gen.py --- ocean_dp/qc/netcdf_gen.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocean_dp/qc/netcdf_gen.py b/ocean_dp/qc/netcdf_gen.py index 7eae159..ef403cf 100755 --- a/ocean_dp/qc/netcdf_gen.py +++ b/ocean_dp/qc/netcdf_gen.py @@ -74,7 +74,7 @@ def netcdf_gen(file_name, nominal_depth, *args): time_var.setncattr('valid_min', 0) t0 = date2num(datetime(2020, 1, 1), units=time_var.units) - ds.variables['TIME'][:] = np.arange(t0, t0 + (1 / 24) * len(var_data[1]), 1 / 24) + ds.variables['TIME'][:] = np.linspace(t0,t0 + (1 / 24) * (len(var_data[0]-1)),num=len(var_data[0])) # Create the nominal depth variable nom_depth_var = ds.createVariable("NOMINAL_DEPTH", "f8") From 2d5bc85a946355b8ee2e96bad1e5093acbac6470 Mon Sep 17 00:00:00 2001 From: bweeding <57697604+bweeding@users.noreply.github.com> Date: Thu, 20 Feb 2020 16:19:53 +1100 Subject: [PATCH 24/59] adds qc variables --- ocean_dp/qc/add_qc_flags.py | 5 +++- ocean_dp/qc/netcdf_gen.py | 42 ++++++++++++++++++++++++++++++++++ ocean_dp/qc/select_in_water.py | 2 ++ 3 files changed, 48 insertions(+), 1 deletion(-) diff --git a/ocean_dp/qc/add_qc_flags.py b/ocean_dp/qc/add_qc_flags.py index bd68234..1d1fa0e 100644 --- a/ocean_dp/qc/add_qc_flags.py +++ b/ocean_dp/qc/add_qc_flags.py @@ -101,4 +101,7 @@ def add_qc(netCDFfile): if __name__ == "__main__": - add_qc(sys.argv[1:]) \ No newline at end of file + add_qc(sys.argv[1:]) + + + \ No newline at end of file diff --git a/ocean_dp/qc/netcdf_gen.py b/ocean_dp/qc/netcdf_gen.py index ef403cf..32cb37c 100755 --- a/ocean_dp/qc/netcdf_gen.py +++ b/ocean_dp/qc/netcdf_gen.py @@ -17,6 +17,7 @@ import sys from datetime import datetime import numpy as np +import shutil # Provide the function with a filename (don't include .nc), a nominal depth, # and pairs of names and arrays containing the data to be included as variables. @@ -26,6 +27,8 @@ # For example, netcdf_gen('test',30,'PRES',pres_data,'TEMP',temp_data) # from the command line gen_test_data.py test 30 PRES 10,20,30 TEMP 11,12,NaN + + def netcdf_gen(file_name, nominal_depth, *args): # Convert the args tuple to a list args = list(args) @@ -93,10 +96,49 @@ def netcdf_gen(file_name, nominal_depth, *args): ds.createVariable(name_in, "f8", ("TIME")) ds.variables[name_in][:] = data_in + # read the variable names from the netCDF dataset + vars = ds.variables + + # create a list of variables, don't include the 'TIME' variable + # TODO: detect 'TIME' variable using the standard name 'time' + to_add = [] + + for v in vars: + #print (vars[v].dimensions) + if v != 'TIME': + to_add.append(v) + + # for each variable, add a new ancillary variable _quality_control to each which has 'TIME' as a dimension + for v in to_add: + if "TIME" in vars[v].dimensions: + # print("time dim ", v) + + if v+"_quality_control" not in ds.variables: + ncVarOut = ds.createVariable(v+"_quality_control", "i1", vars[v].dimensions, fill_value=99, zlib=True) # fill_value=99 otherwise defaults to max, imos-toolbox uses 99 + ncVarOut[:] = np.zeros(vars[v].shape) + ncVarOut.long_name = "quality_code for " + v + ncVarOut.flag_values = np.array([0, 1, 2, 3, 4, 6, 7, 9]) + ncVarOut.flag_meanings = 'unknown good_data probably_good_data probably_bad_data bad_data not_deployed interpolated missing_value' + + + vars[v].ancillary_variables = v + "_quality_control" + + # update the global attributes + ds.file_version = "Level 1 - Quality Controlled Data" + + ds.history = datetime.utcnow().strftime("%Y%m%d:") + ' converted to FV01 file, quality_control variables added.' + + # ADD quality control attributes!! + ds.close() + + #add_qc(file_name) + print("generated ", file_name) return (file_name) + + else: print('Data arrays not of equal length') diff --git a/ocean_dp/qc/select_in_water.py b/ocean_dp/qc/select_in_water.py index 16cf613..8c296c1 100755 --- a/ocean_dp/qc/select_in_water.py +++ b/ocean_dp/qc/select_in_water.py @@ -29,6 +29,8 @@ # Submit argument as a list + + def select_in_water(netCDFfiles): new_name = [] # list of new file names From a5bdf2fef52227be1829dbd1e279e5847a6b8126 Mon Sep 17 00:00:00 2001 From: bweeding <57697604+bweeding@users.noreply.github.com> Date: Fri, 21 Feb 2020 12:00:00 +1100 Subject: [PATCH 25/59] Update flatline_test.py Removed the add_qc part, modified so it's possibl to set files, target variables, window length, and the qc flag assigned. Tested with files made using netcdf_gen. --- ocean_dp/qc/flatline_test.py | 142 +++++++++++------------------------ 1 file changed, 44 insertions(+), 98 deletions(-) diff --git a/ocean_dp/qc/flatline_test.py b/ocean_dp/qc/flatline_test.py index a4e4ec5..074db32 100755 --- a/ocean_dp/qc/flatline_test.py +++ b/ocean_dp/qc/flatline_test.py @@ -1,28 +1,4 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -""" -Created on Mon Feb 3 14:10:41 2020 - -@author: tru050 -""" - -import re -from datetime import datetime, timedelta -from netCDF4 import num2date, date2num -from netCDF4 import stringtochar -import numpy.ma as ma -import sys -from netCDF4 import Dataset -import numpy as np -import argparse -import glob -import pytz -import os - -#!/usr/bin/python3 - -# add_qc_flags -# Copyright (C) 2020 Peter Jansen +# Copyright (C) 2020 Ben Weeding # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by @@ -37,111 +13,81 @@ # You should have received a copy of the GNU General Public License # along with this program. If not, see . -# add QC variables to file - - -def add_qc(netCDFfile): - - new_name = [] # list of new file names - - # loop over all file names given - for fn in netCDFfile[1:]: - ds = Dataset(fn, 'a') - - # read the variable names from the netCDF dataset - vars = ds.variables - - # create a list of variables, don't include the 'TIME' variable - # TODO: detect 'TIME' variable using the standard name 'time' - to_add = [] - for v in vars: - #print (vars[v].dimensions) - if v != 'TIME': - to_add.append(v) - - # for each variable, add a new ancillary variable _quality_control to each which has 'TIME' as a dimension - for v in to_add: - if "TIME" in vars[v].dimensions: - # print("time dim ", v) - - ncVarOut = ds.createVariable(v+"_quality_control", "i1", vars[v].dimensions, fill_value=99, zlib=True) # fill_value=99 otherwise defaults to max, imos-toolbox uses 99 - ncVarOut[:] = np.zeros(vars[v].shape) - ncVarOut.long_name = "quality_code for " + v - - vars[v].ancillary_variables = v + "_quality_control" - - # update the file version attribute - ds.file_version = "Level 1 - Quality Controlled Data" - - ds.close() - - # rename the file FV00 to FV01 (imos specific) - fn_new = fn.replace("FV00", "FV01") - new_name.append(fn_new) - - if fn_new != fn: - # copy file - os.copy(fn, fn_new) - - print(fn_new) - - return new_name - - -if __name__ == "__main__": - add_qc(sys.argv) - -############################################################################## +import re +from datetime import datetime, timedelta +from netCDF4 import num2date, date2num +from netCDF4 import stringtochar +import numpy.ma as ma +import sys +from netCDF4 import Dataset +import numpy as np +import argparse +import glob +import pytz +import os -def flatline_test(*target_files,target_vars=[],window=3): +def flatline_test(target_files,target_vars_in=[],window=3,flag=3): - # If files aren't specified, take all the .nc files in the current folder + # If files aren't specified, take all the IMOS.nc files in the current folder if not target_files: - target_files = glob.glob('*.nc') + target_files = glob.glob('IMOS*.nc') # Loop through each files in target_files for current_file in target_files: - # Print each filename print("input file %s" % current_file) # Extract netcdf data into nc - nc = Dataset(current_file, mode="r") + nc = Dataset(current_file, mode="a") # Extract time ncTime = nc.get_variables_by_attributes(standard_name='time') # If target_vars aren't user specified, set it to all the variables of - # the current_file, removing TIME - if target_vars == []: + # the current_file, removing unwanted variables + if target_vars_in == []: target_vars = list(nc.variables.keys()) + # Remove TIME target_vars.remove('TIME') - # Check if file contains quality control variables, and if not create - - if not any("_quality_control" in i for i in target_vars: - - # insert _quality_control variables into file? - # should this be done now, or should we assume it - # will have already been done? - + # Remove any quality_control variables + qc_vars = [s for s in target_vars if 'quality_control' in s] + + target_vars = [s for s in target_vars if s not in qc_vars] + + # Remove any variables of single length + single_vars = [s for s in target_vars if nc.variables[s].size==1] + + target_vars = [s for s in target_vars if s not in single_vars] + + print('target_vars are '+' '.join(target_vars)) + + else: + target_vars = target_vars_in # For each variable, extract the data for current_var in target_vars: var_data = np.array(nc.variables[current_var]) - for i in 0:(len(var_data)-window+1): + print('checking '+current_var) + + # Step through the data, one element at a time, using the window + for i in range(0,(len(var_data)-window+1)): - # This is true if 'window' elements in a row are equal - if len(set(var_data[i:(i+window)])) == 1 + # This is true if 'window' elements in a row are equal + if len(set(var_data[i:(i+window)])) == 1: # set corresponding QC value to... + nc.variables[current_var+'_quality_control'][i:(i+window)] = flag + + nc.history += ' ' + datetime.utcnow().strftime("%Y%m%d:") + 'flatline_test performed, flatlines of '+str(window)+' consecutive values or more were flagged with '+str(flag) + nc.close() From 05abdb91e2365967043dcba3517f6cfa63467d80 Mon Sep 17 00:00:00 2001 From: Peter Jansen Date: Mon, 24 Feb 2020 10:07:35 +1100 Subject: [PATCH 26/59] Update flatline_test.py separate global fine name gobblin --- ocean_dp/qc/flatline_test.py | 53 ++++++++++++++++++++---------------- 1 file changed, 30 insertions(+), 23 deletions(-) diff --git a/ocean_dp/qc/flatline_test.py b/ocean_dp/qc/flatline_test.py index 074db32..db8fec7 100755 --- a/ocean_dp/qc/flatline_test.py +++ b/ocean_dp/qc/flatline_test.py @@ -26,23 +26,31 @@ import pytz import os -def flatline_test(target_files,target_vars_in=[],window=3,flag=3): - - # If files aren't specified, take all the IMOS.nc files in the current folder - if not target_files: - - target_files = glob.glob('IMOS*.nc') + +# If files aren't specified, take all the IMOS*.nc files in the current folder +def flatline_test_all_files(target_vars_in=[], window=3, flag=3): + target_files = glob.glob('IMOS*.nc') + + flatline_test_files(target_files, target_vars_in=target_vars_in, window=window, flag=flag) + + +def flatline_test_files(target_files, target_vars_in=[], window=3, flag=3): # Loop through each files in target_files for current_file in target_files: - # Print each filename print("input file %s" % current_file) - + # Extract netcdf data into nc nc = Dataset(current_file, mode="a") - - # Extract time + + # run the flat line test + flatline_test(nc=nc, target_vars_in=target_vars_in, window=window, flag=flag) + + +def flatline_test(nc, target_vars_in=[], window=3, flag=3): + + # Extract time, TODO: This is not used, should we set the window based on time, not samples? ncTime = nc.get_variables_by_attributes(standard_name='time') # If target_vars aren't user specified, set it to all the variables of @@ -56,12 +64,10 @@ def flatline_test(target_files,target_vars_in=[],window=3,flag=3): # Remove any quality_control variables qc_vars = [s for s in target_vars if 'quality_control' in s] - target_vars = [s for s in target_vars if s not in qc_vars] # Remove any variables of single length single_vars = [s for s in target_vars if nc.variables[s].size==1] - target_vars = [s for s in target_vars if s not in single_vars] print('target_vars are '+' '.join(target_vars)) @@ -84,16 +90,17 @@ def flatline_test(target_files,target_vars_in=[],window=3,flag=3): # set corresponding QC value to... nc.variables[current_var+'_quality_control'][i:(i+window)] = flag - - nc.history += ' ' + datetime.utcnow().strftime("%Y%m%d:") + 'flatline_test performed, flatlines of '+str(window)+' consecutive values or more were flagged with '+str(flag) - + + # update the history attribute + try: + hist = nc.history + "\n" + except AttributeError: + hist = "" + + nc.setncattr('history', hist + 'flatline_test performed on [' + str(target_vars) + '], window '+str(window)+' consecutive values or more were flagged with '+str(flag) ) + nc.close() - - - - - - - - \ No newline at end of file +if __name__ == "__main__": + # usage is + flatline_test(target_files=[sys.argv[1]], target_vars_in=[sys.argv[2]], window=float(sys.argv[3]), flag=float(sys.argv[4])) From 42ef01b2480a95c3e96ffed5778c4a2e09d19313 Mon Sep 17 00:00:00 2001 From: bweeding <57697604+bweeding@users.noreply.github.com> Date: Mon, 24 Feb 2020 12:16:56 +1100 Subject: [PATCH 27/59] test changes 1st write of spike test, alter name=main and history for flatline --- ocean_dp/qc/flatline_test.py | 7 +- ocean_dp/qc/spike_test | 8 --- ocean_dp/qc/spike_test.py | 120 +++++++++++++++++++++++++++++++++++ 3 files changed, 122 insertions(+), 13 deletions(-) delete mode 100755 ocean_dp/qc/spike_test create mode 100755 ocean_dp/qc/spike_test.py diff --git a/ocean_dp/qc/flatline_test.py b/ocean_dp/qc/flatline_test.py index db8fec7..e211cf4 100755 --- a/ocean_dp/qc/flatline_test.py +++ b/ocean_dp/qc/flatline_test.py @@ -49,9 +49,6 @@ def flatline_test_files(target_files, target_vars_in=[], window=3, flag=3): def flatline_test(nc, target_vars_in=[], window=3, flag=3): - - # Extract time, TODO: This is not used, should we set the window based on time, not samples? - ncTime = nc.get_variables_by_attributes(standard_name='time') # If target_vars aren't user specified, set it to all the variables of # the current_file, removing unwanted variables @@ -97,10 +94,10 @@ def flatline_test(nc, target_vars_in=[], window=3, flag=3): except AttributeError: hist = "" - nc.setncattr('history', hist + 'flatline_test performed on [' + str(target_vars) + '], window '+str(window)+' consecutive values or more were flagged with '+str(flag) ) + nc.setncattr('history', hist + datetime.utcnow().strftime("%Y-%m-%d") + 'flatline_test performed on [' + str(target_vars) + '], window '+str(window)+' consecutive values or more were flagged with '+str(flag) ) nc.close() if __name__ == "__main__": # usage is - flatline_test(target_files=[sys.argv[1]], target_vars_in=[sys.argv[2]], window=float(sys.argv[3]), flag=float(sys.argv[4])) + flatline_test_files(target_files=[sys.argv[1]], target_vars_in=[sys.argv[2]], window=float(sys.argv[3]), flag=float(sys.argv[4])) diff --git a/ocean_dp/qc/spike_test b/ocean_dp/qc/spike_test deleted file mode 100755 index 0bb13ea..0000000 --- a/ocean_dp/qc/spike_test +++ /dev/null @@ -1,8 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -""" -Created on Wed Feb 12 16:28:38 2020 - -@author: tru050 -""" - diff --git a/ocean_dp/qc/spike_test.py b/ocean_dp/qc/spike_test.py new file mode 100755 index 0000000..c18c177 --- /dev/null +++ b/ocean_dp/qc/spike_test.py @@ -0,0 +1,120 @@ +# Copyright (C) 2020 Ben Weeding +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +import re +from datetime import datetime, timedelta +from netCDF4 import num2date, date2num +from netCDF4 import stringtochar +import numpy.ma as ma +import sys +from netCDF4 import Dataset +import numpy as np +import argparse +import glob +import pytz +import os + + +# If files aren't specified, take all the IMOS*.nc files in the current folder +def spike_test_all_files(target_vars_in=[], thresh_low=2, thresh_high=4, flag_low=3, flag_high=4): + target_files = glob.glob('IMOS*.nc') + + spike_test_files(target_files, target_vars_in=target_vars_in, thresh_low=thresh_low,thresh_high=thresh_high,flag_low=flag_low, flag_high=flag_high) + + +def spike_test_files(target_files, target_vars_in=[], thresh_low=2, thresh_high=4, flag_low=3, flag_high=4): + + # Loop through each files in target_files + for current_file in target_files: + # Print each filename + print("input file %s" % current_file) + + # Extract netcdf data into nc + nc = Dataset(current_file, mode="a") + + # run the spike test + spike_test(nc=nc, target_vars_in=target_vars_in, thresh_low=thresh_low,thresh_high=thresh_high,flag_low=flag_low, flag_high=flag_high) + + +def spike_test(nc, target_vars_in=[], thresh_low=2, thresh_high=4, flag_low=3, flag_high=4): + + # If target_vars aren't user specified, set it to all the variables of + # the current_file, removing unwanted variables + if target_vars_in == []: + + target_vars = list(nc.variables.keys()) + + # Remove TIME + target_vars.remove('TIME') + + # Remove any quality_control variables + qc_vars = [s for s in target_vars if 'quality_control' in s] + target_vars = [s for s in target_vars if s not in qc_vars] + + # Remove any variables of single length + single_vars = [s for s in target_vars if nc.variables[s].size==1] + target_vars = [s for s in target_vars if s not in single_vars] + + print('target_vars are '+' '.join(target_vars)) + + else: + target_vars = target_vars_in + + # For each variable, extract the data + for current_var in target_vars: + + var_data = np.array(nc.variables[current_var]) + + print('checking '+current_var) + + # Step through the data, one element at a time, starting from the 2nd element + for i in range(1,(len(var_data)-1)): + + # Calculate the mean of the i-1 and i+1 elements + shoulder_mean = np.mean(np.take(var_data,[i-1,i+1])) + + # Check for spike exceeding high threshold + if abs(var_data[i]-shoulder_mean) > thresh_high: + + #set corresponding QC value to... + nc.variables[current_var+'_quality_control'][i] = flag_high + + + # Check for spike exceeding low threshold + elif abs(var_data[i]-shoulder_mean) > thresh_low: + + # set corresponding QC value to... + nc.variables[current_var+'_quality_control'][i] = flag_low + + # update the history attribute + try: + hist = nc.history + "\n" + except AttributeError: + hist = "" + + nc.setncattr('history', hist + datetime.utcnow().strftime("%Y-%m-%d") + ' :spike_test performed on [' + str(target_vars) + '], with spikes greater than '+str(thresh_high)+' flagged as '+str(flag_high)+' and spikes greater than '+str(thresh_low)+' flagged as '+str(flag_low)) + + nc.close() + +if __name__ == "__main__": + # usage is + spike_test_files(target_files=[sys.argv[1]], target_vars_in=[sys.argv[2]], thresh_low=float(sys.argv[3]), thresh_high=float(sys.argv[4]), flag_low= float(sys.argv[5]), flag_high= float(sys.argv[6])) + + + + + + + From 925101f749a8700c993023a821088918a4522478 Mon Sep 17 00:00:00 2001 From: bweeding <57697604+bweeding@users.noreply.github.com> Date: Mon, 2 Mar 2020 15:56:30 +1100 Subject: [PATCH 28/59] Update spike_test.py --- ocean_dp/qc/spike_test.py | 134 ++++++++++++++++++++++++++------------ 1 file changed, 92 insertions(+), 42 deletions(-) diff --git a/ocean_dp/qc/spike_test.py b/ocean_dp/qc/spike_test.py index c18c177..154f439 100755 --- a/ocean_dp/qc/spike_test.py +++ b/ocean_dp/qc/spike_test.py @@ -50,63 +50,113 @@ def spike_test_files(target_files, target_vars_in=[], thresh_low=2, thresh_high= def spike_test(nc, target_vars_in=[], thresh_low=2, thresh_high=4, flag_low=3, flag_high=4): - # If target_vars aren't user specified, set it to all the variables of - # the current_file, removing unwanted variables - if target_vars_in == []: + # If target_vars aren't user specified, set it to all the variables of + # the current_file, removing unwanted variables + if target_vars_in == []: + + target_vars = list(nc.variables.keys()) + + # Remove TIME + target_vars.remove('TIME') + + # Remove any quality_control variables + qc_vars = [s for s in target_vars if 'quality_control' in s] + target_vars = [s for s in target_vars if s not in qc_vars] + + # Remove any variables of single length + single_vars = [s for s in target_vars if nc.variables[s].size==1] + target_vars = [s for s in target_vars if s not in single_vars] + + print('target_vars are '+' '.join(target_vars)) + + else: + target_vars = target_vars_in + + # For each variable, extract the data + for current_var in target_vars: + + var_data = np.array(nc.variables[current_var]) + + print('checking '+current_var+' for high spikes') + + # Step through the data, one element at a time, starting from the 2nd element + for i in range(1,(len(var_data)-1)): - target_vars = list(nc.variables.keys()) + # Calculate the mean of the i-1 and i+1 elements + shoulder_mean = np.mean(np.take(var_data,[i-1,i+1])) - # Remove TIME - target_vars.remove('TIME') + # Check for spike exceeding high threshold + if abs(var_data[i]-shoulder_mean) > thresh_high: + + print('High spike found') + + #set corresponding QC value to... + nc.variables[current_var+'_quality_control'][i] = flag_high + + # # Extract the qc data + # current_qc = np.array(nc.variables[current_var+'_quality_control'][:]) + + # # Find all the instances of consecutive 4s, and reset them to 0 + # for i in np.where(current_qc==4)[0][0:-1][np.diff(np.where(current_qc==4)[0])==1]: + + # nc.variables[current_var+'_quality_control'][i:i+2] = 0 + + # Find the indices where qc isn't set to 4 (high spike), removing the final element as it can't be check for a spike + low_spike_chk_idx = np.where(nc.variables[current_var+'_quality_control'][:]!=4)[0][0:-1] + + #print(low_spike_chk_idx) + + # Remove from the indices those that are either side of a high spike + for i in np.where(nc.variables[current_var+'_quality_control'][:]==4)[0]: + + low_spike_chk_idx=low_spike_chk_idx[low_spike_chk_idx!=[i-1]] - # Remove any quality_control variables - qc_vars = [s for s in target_vars if 'quality_control' in s] - target_vars = [s for s in target_vars if s not in qc_vars] - - # Remove any variables of single length - single_vars = [s for s in target_vars if nc.variables[s].size==1] - target_vars = [s for s in target_vars if s not in single_vars] + low_spike_chk_idx=low_spike_chk_idx[low_spike_chk_idx!=[i+1]] + + #print(low_spike_chk_idx) + + print('checking '+current_var+' for low spikes') + + # For each of the remaining indices + for i in low_spike_chk_idx: - print('target_vars are '+' '.join(target_vars)) + #print('i is '+str(i)) - else: - target_vars = target_vars_in + # Calculate the mean of the i-1 and i+1 elements + shoulder_mean = np.mean(np.take(var_data,[i-1,i+1])) - # For each variable, extract the data - for current_var in target_vars: + #print('shoulder mean is '+str(shoulder_mean)) - var_data = np.array(nc.variables[current_var]) + abs_diff = abs(var_data[i]-shoulder_mean) - print('checking '+current_var) + #print('absolute difference is '+str(abs_diff)) - # Step through the data, one element at a time, starting from the 2nd element - for i in range(1,(len(var_data)-1)): + # Check for spike exceeding low threshold + if abs(var_data[i]-shoulder_mean) > thresh_low: - # Calculate the mean of the i-1 and i+1 elements - shoulder_mean = np.mean(np.take(var_data,[i-1,i+1])) + print('Low spike found') - # Check for spike exceeding high threshold - if abs(var_data[i]-shoulder_mean) > thresh_high: - - #set corresponding QC value to... - nc.variables[current_var+'_quality_control'][i] = flag_high + #set corresponding QC value to... + nc.variables[current_var+'_quality_control'][i] = flag_low + + # # Extract the qc data + # current_qc = np.array(nc.variables[current_var+'_quality_control'][:]) + + # # Find all the instances of consecutive 3s, and reset them to 0 + # for i in np.where(current_qc==3)[0][0:-1][np.diff(np.where(current_qc==3)[0])==1]: + + # nc.variables[current_var+'_quality_control'][i:i+2] = 0 - - # Check for spike exceeding low threshold - elif abs(var_data[i]-shoulder_mean) > thresh_low: - - # set corresponding QC value to... - nc.variables[current_var+'_quality_control'][i] = flag_low - # update the history attribute - try: - hist = nc.history + "\n" - except AttributeError: - hist = "" + # update the history attribute + try: + hist = nc.history + "\n" + except AttributeError: + hist = "" - nc.setncattr('history', hist + datetime.utcnow().strftime("%Y-%m-%d") + ' :spike_test performed on [' + str(target_vars) + '], with spikes greater than '+str(thresh_high)+' flagged as '+str(flag_high)+' and spikes greater than '+str(thresh_low)+' flagged as '+str(flag_low)) + nc.setncattr('history', hist + datetime.utcnow().strftime("%Y-%m-%d") + ' :spike_test performed on [' + str(target_vars) + '], with spikes greater than '+str(thresh_high)+' flagged as '+str(flag_high)+' and spikes greater than '+str(thresh_low)+' flagged as '+str(flag_low)) - nc.close() + nc.close() if __name__ == "__main__": # usage is From 62b0419a1fcbb368a152728680bf3586c060d77c Mon Sep 17 00:00:00 2001 From: bweeding <57697604+bweeding@users.noreply.github.com> Date: Tue, 3 Mar 2020 12:42:42 +1100 Subject: [PATCH 29/59] Update netcdf_gen.py Brought -1 outside brackets, noticed that netcdfgen was not producing hourly data when using for rate of change test. --- ocean_dp/qc/netcdf_gen.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/ocean_dp/qc/netcdf_gen.py b/ocean_dp/qc/netcdf_gen.py index 32cb37c..1b56de8 100755 --- a/ocean_dp/qc/netcdf_gen.py +++ b/ocean_dp/qc/netcdf_gen.py @@ -77,7 +77,7 @@ def netcdf_gen(file_name, nominal_depth, *args): time_var.setncattr('valid_min', 0) t0 = date2num(datetime(2020, 1, 1), units=time_var.units) - ds.variables['TIME'][:] = np.linspace(t0,t0 + (1 / 24) * (len(var_data[0]-1)),num=len(var_data[0])) + ds.variables['TIME'][:] = np.linspace(t0,t0 + (1 / 24) * (len(var_data[0])-1),num=len(var_data[0])) # Create the nominal depth variable nom_depth_var = ds.createVariable("NOMINAL_DEPTH", "f8") @@ -96,7 +96,8 @@ def netcdf_gen(file_name, nominal_depth, *args): ds.createVariable(name_in, "f8", ("TIME")) ds.variables[name_in][:] = data_in - # read the variable names from the netCDF dataset + + # read the variable names from the netCDF dataset vars = ds.variables # create a list of variables, don't include the 'TIME' variable From cc0589762a76caea2a6334a918883ff77bfeb02b Mon Sep 17 00:00:00 2001 From: bweeding <57697604+bweeding@users.noreply.github.com> Date: Wed, 4 Mar 2020 16:32:16 +1100 Subject: [PATCH 30/59] Create rate_of_change_test.py Haven't yet worked out how to use sys.argv[] with both *args and a keyword argument - line 178 --- ocean_dp/qc/rate_of_change_test.py | 184 +++++++++++++++++++++++++++++ 1 file changed, 184 insertions(+) create mode 100755 ocean_dp/qc/rate_of_change_test.py diff --git a/ocean_dp/qc/rate_of_change_test.py b/ocean_dp/qc/rate_of_change_test.py new file mode 100755 index 0000000..c3e1d9b --- /dev/null +++ b/ocean_dp/qc/rate_of_change_test.py @@ -0,0 +1,184 @@ +# Copyright (C) 2020 Ben Weeding +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +import re +from datetime import datetime, timedelta +from netCDF4 import num2date, date2num +from netCDF4 import stringtochar +import numpy.ma as ma +import sys +from netCDF4 import Dataset +import numpy as np +import argparse +import glob +import pytz +import os + +# how to specific rate of change? Will the function take rate of change as an +# argument? + +# Need a way of linking the rate of change to each different variable type + +# Could max rate of change be something we upload into the netcdf file atts? Or just in history? + +# Tell the function how much change you tolerate, and over what period of time - in sec? + +# Then convert to match the file timesteps + +# If files aren't specified, take all the IMOS*.nc files in the current folder +def roc_test_all_files(*args,target_vars_in=[]): + target_files = glob.glob('IMOS*.nc') + + roc_test_files(target_files, target_vars_in=target_vars_in,*args) + +def roc_test_files(target_files,*args,target_vars_in=[]): + + # Loop through each files in target_files + for current_file in target_files: + # Print each filename + print("input file %s" % current_file) + + print(args) + + # Extract netcdf data into nc + nc = Dataset(current_file, mode="a") + + # run the spike test - specifying *args here makes python unpack args to be passed again successfully as separate items + roc_test(nc,*args, target_vars_in=target_vars_in) + + +# Enter args as variable name and rate of change limit, ie. 'TEMP',4 +def roc_test(nc,*args,target_vars_in=[]): + + # Check the time format + if nc.variables['TIME'].getncattr('units') =='days since 1950-01-01 00:00:00 UTC': + + # Convert the args tuple to a list + args = list(args) + + # If a single rate of change limit is supplied + if len(args) == 1: + + change_per_hr = args[0] + + print('One rate of change limit will be applied to all variables') + + # If target_vars aren't user specified, set it to all the variables of + # the current_file, removing unwanted variables + if target_vars_in == []: + + target_vars = list(nc.variables.keys()) + + # Remove TIME + target_vars.remove('TIME') + + # Remove any quality_control variables + qc_vars = [s for s in target_vars if 'quality_control' in s] + target_vars = [s for s in target_vars if s not in qc_vars] + + # Remove any variables of single length + single_vars = [s for s in target_vars if nc.variables[s].size==1] + target_vars = [s for s in target_vars if s not in single_vars] + + print('target_vars are '+' '.join(target_vars)) + + else: + target_vars = target_vars_in + + # Extract the time data + nc_time = np.array(nc.variables['TIME'][:]) + + # Convert from days to hours + nc_time_hr = nc_time*24 + + # For each variable + for current_var in target_vars: + + # Extract the data + var_data = np.array(nc.variables[current_var]) + + # Calculate dvar/dtime + var_roc = np.divide(np.diff(var_data),np.diff(nc_time_hr)) + + # For any change greater than change_per_hr, assign a qc value of 4 + nc.variables[current_var+'_quality_control'][[x for x in abs(np.insert(var_roc,0,0)) > change_per_hr]] = 4 + + # update the history attribute + try: + hist = nc.history + "\n" + except AttributeError: + hist = "" + + nc.setncattr('history', hist + datetime.utcnow().strftime("%Y-%m-%d") + 'rate of change test performed, with all changes above '+str(change_per_hr)+' flagged as 4') + + + # If multiple rate of change limits are supplied, with variable names + elif len(args) % 2 == 0 and all(isinstance(x,str) for x in args[0::2]) and all(isinstance(y,(float,int)) for y in args[1::2]): + + # Take target variables from args + target_vars = args[0::2] + + print('target_vars are '+' '.join(target_vars)) + + # Convert arguments to dict + rate_spec = dict(zip(args[0::2],args[1::2])) + + # Extract the time data + nc_time = np.array(nc.variables['TIME'][:]) + + # Convert from days to hours + nc_time_hr = nc_time*24 + + # For each variable + for current_var in target_vars: + + # Extract the data + var_data = np.array(nc.variables[current_var]) + + # Calculate dvar/dtime + var_roc = np.divide(np.diff(var_data),np.diff(nc_time_hr)) + + # For any change greater than change_per_hr, assign a qc value of 4 + nc.variables[current_var+'_quality_control'][[x for x in abs(np.insert(var_roc,0,0)) > rate_spec[current_var]]] = 4 + + # update the history attribute + try: + hist = nc.history + "\n" + except AttributeError: + hist = "" + + nc.setncattr('history', hist + datetime.utcnow().strftime("%Y-%m-%d") + ': rate of change test performed, with all changes above those specified in the following list flagged as 4: '+str(args)) + + + else: + print('Arguments passed do not match the required format. No roc test performed.') + + + # If the time format doesn't match IMOS requirements + else: + print('Time format does not match the required IMOS form of: days since 1950-01-01 00:00:00 UTC') + + + nc.close() + + +# Not sure how to sys.argv[] with both *args and a keyword argument +if __name__ == "__main__": + # usage is <*args> + roc_test_files(target_files=[sys.argv[1]], target_vars_in=[sys.argv[2]], *sys.argv[3:]) + + + + From 24e5efd24cb3c87d7d5273b437ab6ea7e8a1771b Mon Sep 17 00:00:00 2001 From: bweeding <57697604+bweeding@users.noreply.github.com> Date: Wed, 11 Mar 2020 19:56:10 +1100 Subject: [PATCH 31/59] Create temp_diff_histograms.py --- ocean_dp/qc/temp_diff_histograms.py | 109 ++++++++++++++++++++++++++++ 1 file changed, 109 insertions(+) create mode 100755 ocean_dp/qc/temp_diff_histograms.py diff --git a/ocean_dp/qc/temp_diff_histograms.py b/ocean_dp/qc/temp_diff_histograms.py new file mode 100755 index 0000000..2a78a92 --- /dev/null +++ b/ocean_dp/qc/temp_diff_histograms.py @@ -0,0 +1,109 @@ +# Copyright (C) 2020 Ben Weeding +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +import numpy.ma as ma +import sys +from netCDF4 import Dataset +import numpy as np +import argparse +import glob +import pytz +import os +import matplotlib.pyplot as plt +from matplotlib import colors +from matplotlib.ticker import PercentFormatter + +netcdf_files = [] + +temp_diffs = np.array([]) + +for root, dirs, files in os.walk("/Users/tru050/Desktop/cloudstor/Shared/SOTS-Temp-Raw-Data"): + for file,dirx in files,dirs: + if file.endswith('.nc'): + netcdf_files.append(file) + nc = Dataset(os.path.join(dirs, file),mode='r') + + temp_diffs = np.concatenate((temp_diffs,np.diff(np.array(nc.variables['TEMP'][:])))) + + nc.close() + + +print (list_of_files) + + + + + +files = glob.glob('*.nc') + +temp_diffs = np.array([]) + +for current_file in files: + + nc = Dataset(current_file,mode='r') + + temp_diffs = np.concatenate((temp_diffs,np.diff(np.array(nc.variables['TEMP'][:])))) + +fig, ax = plt.subplots() + +ax.hist(temp_diffs,100,log=True) + + +# use os.walk??? to run in each netcdf folder?? os.scandir()? + + + + + +# sofs75_60m = Dataset('IMOS_ABOS-SOTS_T_20180801_SOFS_FV00_SOFS-7.5-2018-Starmon-mini-4051-60m_END-20190331_C-20200204.nc',mode='r') +# sofs75_70m = Dataset('IMOS_ABOS-SOTS_T_20180801_SOFS_FV00_SOFS-7.5-2018-Starmon-mini-4052-70m_END-20190331_C-20200204.nc',mode='r') +# sofs75_75m = Dataset('IMOS_ABOS-SOTS_T_20180801_SOFS_FV00_SOFS-7.5-2018-Starmon-mini-4053-75m_END-20190331_C-20200204.nc',mode='r') + + +# temp_60 = np.array(sofs75_60m.variables['TEMP'][:]) +# temp_70 = np.array(sofs75_70m.variables['TEMP'][:]) +# temp_75 = np.array(sofs75_75m.variables['TEMP'][:]) + +# label_coords = (0.01, 0.85) +# label_method = 'axes fraction' + +# fig, axs = plt.subplots(3, 1, sharey=True) +# axs[0].set_title('Temp sensor comparison SOFS7.5') + +# axs[0].hist(np.diff(temp_60),bins=100,log=True, histtype='bar', stacked=True) +# axs[0].set_ylim(bottom=0.1,top=10E5) +# axs[0].set_xlim(left=-40, right=40) +# axs[0].annotate('60m',xy=label_coords, xycoords=label_method) +# axs[0].tick_params(labelbottom=False) + +# axs[1].hist(np.diff(temp_70),bins=100,log=True) +# axs[1].set_ylim(bottom=0.1,top=10E5) +# axs[1].set_xlim(left=-40, right=40) +# axs[1].annotate('70m',xy=label_coords, xycoords=label_method) +# axs[1].tick_params(labelbottom=False) + +# axs[2].hist(np.diff(temp_75),bins=100,log=True) +# axs[2].set_ylim(bottom=0.1,top=10E5) +# axs[2].set_xlim(left=-40, right=40) +# axs[2].annotate('75m',xy=label_coords, xycoords=label_method) + + + +# fig.savefig('test.pdf') + + + + + + From fd81c7504b9af279448d544fb4b1da1229904d2c Mon Sep 17 00:00:00 2001 From: bweeding <57697604+bweeding@users.noreply.github.com> Date: Mon, 16 Mar 2020 18:05:27 +1100 Subject: [PATCH 32/59] plots for qc testing --- ocean_dp/qc/agg_temp_plot.py | 104 ++++++++++++ ocean_dp/qc/in_out_water.py | 10 ++ ocean_dp/qc/temp_diff_hist_extra.py | 67 ++++++++ ocean_dp/qc/temp_diff_hist_glob | 121 ++++++++++++++ ocean_dp/qc/temp_diff_hist_glob.py | 119 ++++++++++++++ ocean_dp/qc/temp_diff_histograms.py | 244 +++++++++++++++++++++++++++- ocean_dp/qc/temp_time_diff_plots.py | 117 +++++++++++++ 7 files changed, 773 insertions(+), 9 deletions(-) create mode 100755 ocean_dp/qc/agg_temp_plot.py create mode 100755 ocean_dp/qc/temp_diff_hist_extra.py create mode 100755 ocean_dp/qc/temp_diff_hist_glob create mode 100755 ocean_dp/qc/temp_diff_hist_glob.py create mode 100755 ocean_dp/qc/temp_time_diff_plots.py diff --git a/ocean_dp/qc/agg_temp_plot.py b/ocean_dp/qc/agg_temp_plot.py new file mode 100755 index 0000000..cb92107 --- /dev/null +++ b/ocean_dp/qc/agg_temp_plot.py @@ -0,0 +1,104 @@ +# Copyright (C) 2020 Ben Weeding +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +import numpy.ma as ma +import sys +from netCDF4 import Dataset, num2date +from dateutil import parser +import numpy as np +import argparse +import glob +import pytz +import os +import matplotlib.pyplot as plt +from matplotlib import colors +from matplotlib.ticker import PercentFormatter +from sigfig import round + + +x=Dataset('IMOS_ABOS-SOTS_COPSTIP_20180822_SOFS_FV02_SOFS-Aggregate-TEMP_END-20190322_C-20200311.nc',mode='r') + +temp = np.array(x.variables['TEMP'][:]) + +time = np.array(x.variables['TIME'][:]) + +ins_idx = np.array(x.variables['instrument_index'][:]) + +fig, ax = plt.subplots(6,5) + +ax=ax.flatten() + +label_coords = (0.1, 0.8) +label_method = 'axes fraction' + +for i in set(np.array(ins_idx)): + + ax[i].plot(time[ins_idx==i],temp[ins_idx==i]) + + ax[i].annotate('S:'+str(i),xy=label_coords, xycoords=label_method,fontsize=8) + +i=1 +fig, ax = plt.subplots() +ax.plot(time[ins_idx==i],temp[ins_idx==i]) + + +# Remove bad instruments +good_vals = [a!=14 and a!=15 for a in ins_idx] + +fig, ax = plt.subplots() +ax.hist(temp[good_vals],21) + +sofs75_temp_diffs = np.array([]) + +good_ins = set(np.array(ins_idx)) + +good_ins -= {14,15} + +for i in good_ins: + + cur_temp = temp[ins_idx==i] + + cur_time = time[ins_idx==i] + + cur_time_hr = cur_time*24 + + # Calculate time changes + cur_time_hr_diffs = np.diff(cur_time_hr) + + cur_temp_diffs = np.diff(cur_temp) + + # Calculate the rate of change of temperature wrt time + cur_dtemp_dtime = np.divide(cur_temp_diffs,cur_time_hr_diffs) + + print('ins '+str(i)+':'+str(np.max(cur_dtemp_dtime))) + + sofs75_temp_diffs = np.concatenate((sofs75_temp_diffs,cur_dtemp_dtime)) + + + + + + + + + + + + + + + + + diff --git a/ocean_dp/qc/in_out_water.py b/ocean_dp/qc/in_out_water.py index ac14840..2543c23 100644 --- a/ocean_dp/qc/in_out_water.py +++ b/ocean_dp/qc/in_out_water.py @@ -60,6 +60,16 @@ def in_out_water(netCDFfile): ds.file_version = "Level 1 - Quality Controlled Data" + + # update the history attribute + try: + hist = nc.history + "\n" + + except AttributeError: + hist = "" + + nc.setncattr('history', hist + datetime.utcnow().strftime("%Y-%m-%d") + ': in water test performed, with out of water data flagged at QC=7') + ds.close() diff --git a/ocean_dp/qc/temp_diff_hist_extra.py b/ocean_dp/qc/temp_diff_hist_extra.py new file mode 100755 index 0000000..d8b6019 --- /dev/null +++ b/ocean_dp/qc/temp_diff_hist_extra.py @@ -0,0 +1,67 @@ +# Copyright (C) 2020 Ben Weeding +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +for root, dirs, files in os.walk("/Users/tru050/Desktop/cloudstor/Shared/SOTS-Temp-Raw-Data"): + + for fname in files: + + if fname.endswith('.nc') and 'FV01' in fname: + + print(fname) #Here, the wanted file name is printed + + ds = Dataset(os.path.join(root,fname), 'a') + + vars = ds.variables + + to_add = [] + for v in vars: + #print (vars[v].dimensions) + if v != 'TIME': + to_add.append(v) + + time_var = vars["TIME"] + time = num2date(time_var[:], units=time_var.units, calendar=time_var.calendar) + + time_deploy = parser.parse(ds.time_deployment_start, ignoretz=True) + time_recovery = parser.parse(ds.time_deployment_end, ignoretz=True) + + print(time_deploy) + + print(to_add) + for v in to_add: + if "TIME" in vars[v].dimensions: + + if v.endswith("_quality_control"): + + print("QC time dim ", v) + + ncVarOut = vars[v] + mask = (time <= time_deploy) | (time >= time_recovery) + ncVarOut[mask] = np.ones(vars[v].shape)[mask] * 7 + + + ds.file_version = "Level 1 - Quality Controlled Data" + + # update the history attribute + try: + hist = ds.history + "\n" + + except AttributeError: + hist = "" + + ds.setncattr('history', hist + datetime.utcnow().strftime("%Y-%m-%d") + ': in water test performed, with out of water data flagged at QC=7') + + + ds.close() \ No newline at end of file diff --git a/ocean_dp/qc/temp_diff_hist_glob b/ocean_dp/qc/temp_diff_hist_glob new file mode 100755 index 0000000..a87ca72 --- /dev/null +++ b/ocean_dp/qc/temp_diff_hist_glob @@ -0,0 +1,121 @@ +# Copyright (C) 2020 Ben Weeding +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +import numpy.ma as ma +import sys +from netCDF4 import Dataset +import numpy as np +import argparse +import glob +import pytz +import os +import matplotlib.pyplot as plt +from matplotlib import colors +from matplotlib.ticker import PercentFormatter +import glob + +deployments = [] + +for x in os.listdir("/Users/tru050/Desktop/cloudstor/Shared/SOTS-Temp-Raw-Data"): + + if ('Pulse' in x) or ('SOFS' in x): + + deployments.append(x) + + +fv01_files = glob.glob("/Users/tru050/Desktop/cloudstor/Shared/SOTS-Temp-Raw-Data/*/*/*FV01*.nc") + + +fig, ax = plt.subplots(4,4,sharex='all', sharey='all') + +ax=ax.flatten() + + +for current_deployment, plt_idx in zip(deployments, range(0,16)): + + + + for fname in files: + + if fname.find(current_deployment) and fname.endswith('.nc') and 'FV01' in fname: + + print(fname) #Here, the wanted file name is printed + + nc = Dataset(os.path.join(root,fname), mode = 'r') + + if 'TEMP_quality_control' in list(nc.variables) and np.array(nc.variables['TEMP'][:]).ndim == 1 and nc.variables['TIME'].getncattr('units') =='days since 1950-01-01 00:00:00 UTC': + + # Calculate temperature changes + nc_temp_diffs = np.diff(np.array(nc.variables['TEMP'][np.array(nc.variables['TEMP_quality_control'][:])!=7])) + + # Extract the time data + nc_time = np.array(nc.variables['TIME'][np.array(nc.variables['TEMP_quality_control'][:])!=7]) + + # Convert from days to hours + nc_time_hr = nc_time*24 + + # Calculate time changes + nc_time_hr_diffs = np.diff(nc_time_hr) + + # Calculate the rate of change of temperature wrt time + nc_dtemp_dtime = np.divide(nc_temp_diffs,nc_time_hr_diffs) + + # Add the results for this netcdf to the record for all files + #all_dtemp_dtime = np.concatenate((all_dtemp_dtime,nc_dtemp_dtime)) + + #all_dtemp_dtime_deps += ([nc.deployment_code] * len(nc_dtemp_dtime)) + + #netcdffiles.append(fname) + + #mins.append(np.amin(nc_dtemp_dtime)) + + #maxs.append(np.amax(nc_dtemp_dtime)) + + + + nc.close() + + ax[plt_idx].hist(nc_dtemp_dtime,100,log=True) + + ax[plt_idx].set_ylim(bottom=0.1,top=10E5) + + ax[plt_idx].set_xlim(left=-500, right=500) + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/ocean_dp/qc/temp_diff_hist_glob.py b/ocean_dp/qc/temp_diff_hist_glob.py new file mode 100755 index 0000000..1e63d14 --- /dev/null +++ b/ocean_dp/qc/temp_diff_hist_glob.py @@ -0,0 +1,119 @@ +# Copyright (C) 2020 Ben Weeding +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +import numpy.ma as ma +import sys +from netCDF4 import Dataset +import numpy as np +import argparse +import glob +import pytz +import os +import matplotlib.pyplot as plt +from matplotlib import colors +from matplotlib.ticker import PercentFormatter +import glob + +deployments = [] + +for x in os.listdir("/Users/tru050/Desktop/cloudstor/Shared/SOTS-Temp-Raw-Data"): + + if ('Pulse' in x) or ('SOFS' in x): + + deployments.append(x) + + +deployment_dtemp_dtime = np.array([]) + +fv01_files = glob.glob("/Users/tru050/Desktop/cloudstor/Shared/SOTS-Temp-Raw-Data/*/*/*FV01*.nc") + + +fig, ax = plt.subplots(4,4,sharex='all', sharey='all') + +ax=ax.flatten() + + +for current_deployment, plt_idx in zip(deployments, range(0,len(deployments))): + + print(current_deployment + 'files') + + for fname in fv01_files: + + if current_deployment in fname: + + #print(fname + ' contains ' + current_deployment) #Here, the wanted file name is printed + + nc = Dataset(fname, mode = 'r') + + if 'TEMP_quality_control' in list(nc.variables) and np.array(nc.variables['TEMP'][:]).ndim == 1: + + print(fname) + + # Calculate temperature changes + nc_temp_diffs = np.diff(np.array(nc.variables['TEMP'][np.array(nc.variables['TEMP_quality_control'][:])!=7])) + + # Extract the time data + nc_time = np.array(nc.variables['TIME'][np.array(nc.variables['TEMP_quality_control'][:])!=7]) + + # Convert from days to hours + nc_time_hr = nc_time*24 + + # Calculate time changes + nc_time_hr_diffs = np.diff(nc_time_hr) + + # Calculate the rate of change of temperature wrt time + nc_dtemp_dtime = np.divide(nc_temp_diffs,nc_time_hr_diffs) + + # Add the results for this netcdf to the record for the deployment + deployment_dtemp_dtime = np.concatenate((deployment_dtemp_dtime,nc_dtemp_dtime)) + + nc.close() + + print('plotting '+ str(len(deployment_dtemp_dtime)) + ' values') + + ax[plt_idx].hist(deployment_dtemp_dtime,100) + + #ax[plt_idx].set_ylim(bottom=0.1,top=10E5) + + #ax[plt_idx].set_xlim(left=-10, right=10) + + deployment_dtemp_dtime = np.array([]) + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/ocean_dp/qc/temp_diff_histograms.py b/ocean_dp/qc/temp_diff_histograms.py index 2a78a92..2003afc 100755 --- a/ocean_dp/qc/temp_diff_histograms.py +++ b/ocean_dp/qc/temp_diff_histograms.py @@ -14,7 +14,8 @@ # along with this program. If not, see . import numpy.ma as ma import sys -from netCDF4 import Dataset +from netCDF4 import Dataset, num2date +from dateutil import parser import numpy as np import argparse import glob @@ -23,26 +24,249 @@ import matplotlib.pyplot as plt from matplotlib import colors from matplotlib.ticker import PercentFormatter +from sigfig import round -netcdf_files = [] +deployments = [] -temp_diffs = np.array([]) +for x in os.listdir("/Users/tru050/Desktop/cloudstor/Shared/SOTS-Temp-Raw-Data"): + + if ('Pulse' in x) or ('SOFS' in x): + + deployments.append(x) + + + +# check for in water test in history of netcdf file, if not perform the test + +netcdffiles = [] + +mins=[] + +maxs=[] + +all_dtemp_dtime = np.array([]) + +all_dtemp_dtime_deps = [] for root, dirs, files in os.walk("/Users/tru050/Desktop/cloudstor/Shared/SOTS-Temp-Raw-Data"): - for file,dirx in files,dirs: - if file.endswith('.nc'): - netcdf_files.append(file) - nc = Dataset(os.path.join(dirs, file),mode='r') + + for fname in files: + + if fname.endswith('.nc') and 'FV01' in fname: + + print(fname) #Here, the wanted file name is printed + + + nc = Dataset(os.path.join(root,fname), mode = 'r') + + if 'TEMP_quality_control' in list(nc.variables) and np.array(nc.variables['TEMP'][:]).ndim == 1 and nc.variables['TIME'].getncattr('units') =='days since 1950-01-01 00:00:00 UTC': + + # Calculate temperature changes + nc_temp_diffs = np.diff(np.array(nc.variables['TEMP'][np.array(nc.variables['TEMP_quality_control'][:])!=7])) + + # Extract the time data + nc_time = np.array(nc.variables['TIME'][np.array(nc.variables['TEMP_quality_control'][:])!=7]) - temp_diffs = np.concatenate((temp_diffs,np.diff(np.array(nc.variables['TEMP'][:])))) + # Convert from days to hours + nc_time_hr = nc_time*24 + + # Calculate time changes + nc_time_hr_diffs = np.diff(nc_time_hr) + + # Calculate the rate of change of temperature wrt time + nc_dtemp_dtime = np.divide(nc_temp_diffs,nc_time_hr_diffs) + + # Add the results for this netcdf to the record for all files + all_dtemp_dtime = np.concatenate((all_dtemp_dtime,nc_dtemp_dtime)) + + all_dtemp_dtime_deps += ([nc.deployment_code] * len(nc_dtemp_dtime)) + + netcdffiles.append(fname) + + mins.append(np.amin(nc_dtemp_dtime)) + + maxs.append(np.amax(nc_dtemp_dtime)) nc.close() + + +fig, ax = plt.subplots() + +bins = np.linspace(-450,450,901) + +line_thick = 0.5 + +counts,bins,bars = ax.hist(all_dtemp_dtime,bins,log=True) + +ax.axvline(x=3*np.std(all_dtemp_dtime),color='r',linewidth=line_thick) + +ax.axvline(x=-3*np.std(all_dtemp_dtime),color='r',linewidth=line_thick) + +ax.set_title('Hourly temp changes from all FV01 files in SOTS-TEMP-Raw_Data') + +label_coords = (0.01, 0.9) + +label_method = 'axes fraction' + +ax.annotate('~1.84E7 measurements',xy=label_coords, xycoords=label_method) + + + +def last_four(entry): + + output = entry[-4::] + + return output + + +deployments = [] + +for x in os.listdir("/Users/tru050/Desktop/cloudstor/Shared/SOTS-Temp-Raw-Data"): + + if ('Pulse' in x) or ('SOFS' in x): + + deployments.append(x) + +deployments.sort(key=last_four) + + + + + + + +all_deployment_dtemp_dtime = [None] * len(deployments) + +for current_deployment, plt_idx in zip(deployments, range(0,len(deployments))): + + print('current deployment is '+current_deployment) + + deployment_dtemp_dtime = np.array([]) + + for root, dirs, files in os.walk("/Users/tru050/Desktop/cloudstor/Shared/SOTS-Temp-Raw-Data"): + + for fname in files: + + if current_deployment in fname and fname.endswith('.nc') and 'FV00' in fname: + + #print(fname) #Here, the wanted file name is printed + + nc = Dataset(os.path.join(root,fname), mode = 'r') + + if 'TEMP' in nc.variables and np.array(nc.variables['TEMP'][:]).ndim == 1 and nc.variables['TIME'].getncattr('units') =='days since 1950-01-01 00:00:00 UTC': + + time_var = nc.variables["TIME"] + + time = num2date(time_var[:], units=time_var.units, calendar=time_var.calendar) + + time_deploy = parser.parse(nc.time_deployment_start, ignoretz=True) + + time_recovery = parser.parse(nc.time_deployment_end, ignoretz=True) + + #print('using '+fname) + + temp_extract = np.array(nc.variables['TEMP'][:][(time >= time_deploy) | (time <= time_recovery)]) + + # Calculate temperature changes + nc_temp_diffs = np.diff(temp_extract) + + # Extract the time data + nc_time = np.array(nc.variables['TIME'][:][(time >= time_deploy) | (time <= time_recovery)]) + + # Convert from days to hours + nc_time_hr = nc_time*24 + + # Calculate time changes + nc_time_hr_diffs = np.diff(nc_time_hr) + + # Calculate the rate of change of temperature wrt time + nc_dtemp_dtime = np.divide(nc_temp_diffs,nc_time_hr_diffs) + + # Add the results for this netcdf to the record for the deployment + deployment_dtemp_dtime = np.concatenate((deployment_dtemp_dtime,nc_dtemp_dtime)) + + all_deployment_dtemp_dtime[plt_idx] = deployment_dtemp_dtime + + nc.close() + + + + + + + + +fig, ax = plt.subplots(4,4) + +ax=ax.flatten() + +line_thick = 1 + +label_coords = (0.6, 0.6) +label_method = 'axes fraction' + +for plt_idx,dep_name in zip(range(0,len(deployments)),deployments): + + print('plotting '+ str(len(all_deployment_dtemp_dtime[plt_idx])) + ' values') + hist_data = ax[plt_idx].hist(all_deployment_dtemp_dtime[plt_idx],21,log=True) + + ax[plt_idx].set_title(dep_name,fontsize=10) + + #ax[plt_idx].axvline(x=3*np.mean(all_deployment_dtemp_dtime[plt_idx]),color='g',linewidth=line_thick) + + ax[plt_idx].axvline(x=np.mean(all_deployment_dtemp_dtime[plt_idx])+3*np.std(all_deployment_dtemp_dtime[plt_idx]),color='r',linewidth=line_thick) -print (list_of_files) + ax[plt_idx].axvline(x=np.mean(all_deployment_dtemp_dtime[plt_idx])-3*np.std(all_deployment_dtemp_dtime[plt_idx]),color='r',linewidth=line_thick) + anno = 'mean = '+str(round(float(np.mean(all_deployment_dtemp_dtime[plt_idx])),sigfigs=3)) + anno += '\n3SD = ' + str(round(float(3*np.std(all_deployment_dtemp_dtime[plt_idx])),sigfigs=3)) + + anno += '\nsamples = ' + str(len(all_deployment_dtemp_dtime[plt_idx])) + + ax[plt_idx].annotate(anno,xy=label_coords, xycoords=label_method,fontsize=8) + + #ax[plt_idx].set_ylim(bottom=0,top=np.max(hist_data[0])) + + #ax[plt_idx].set_xlim(left=-450, right=450) np.linspace(-450,450,901) + +#ax[-1].axis('off') + +all_data = np.concatenate(all_deployment_dtemp_dtime) + +hist_data = ax[15].hist(all_data,21,log=True) + +ax[15].set_title('All data',fontsize=10) + +#ax[plt_idx].axvline(x=3*np.mean(all_deployment_dtemp_dtime[plt_idx]),color='g',linewidth=line_thick) + +ax[15].axvline(x=np.mean(all_data)+3*np.std(all_data),color='r',linewidth=line_thick) + +ax[15].axvline(x=np.mean(all_data)-3*np.std(all_data),color='r',linewidth=line_thick) + +anno = 'mean = '+str(round(float(np.mean(all_data)),sigfigs=3)) + +anno += '\n3SD = ' + str(round(float(3*np.std(all_data)),sigfigs=3)) + +anno += '\nsamples = ' + str(len(all_data)) + +ax[15].annotate(anno,xy=label_coords, xycoords=label_method,fontsize=8) + +fig.subplots_adjust(left=0.05,right=0.99,bottom=0.1,top=0.9,wspace=0.15,hspace=0.4) + + + + + + + + + + + + files = glob.glob('*.nc') @@ -60,6 +284,8 @@ ax.hist(temp_diffs,100,log=True) + + # use os.walk??? to run in each netcdf folder?? os.scandir()? diff --git a/ocean_dp/qc/temp_time_diff_plots.py b/ocean_dp/qc/temp_time_diff_plots.py new file mode 100755 index 0000000..cf42a13 --- /dev/null +++ b/ocean_dp/qc/temp_time_diff_plots.py @@ -0,0 +1,117 @@ +# Copyright (C) 2020 Ben Weeding +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +import numpy.ma as ma +import sys +from netCDF4 import Dataset, num2date +from dateutil import parser +import numpy as np +import argparse +import glob +import pytz +import os +import matplotlib.pyplot as plt +from matplotlib import colors +from matplotlib.ticker import PercentFormatter +from sigfig import round + + + +def last_four(entry): + + output = entry[-4::] + + return output + + +deployments = [] + +for x in os.listdir("/Users/tru050/Desktop/cloudstor/Shared/SOTS-Temp-Raw-Data"): + + if ('Pulse' in x) or ('SOFS' in x): + + deployments.append(x) + +deployments.sort(key=last_four) + + +fig, ax = plt.subplots(4,4) + +ax=ax.flatten() + + +all_deployment_dtemp_dtime = [None] * len(deployments) + +for current_deployment, plt_idx in zip(deployments, range(0,len(deployments))): + + print('current deployment is '+current_deployment) + + deployment_dtemp_dtime = np.array([]) + + for root, dirs, files in os.walk("/Users/tru050/Desktop/cloudstor/Shared/SOTS-Temp-Raw-Data"): + + for fname in files: + + if current_deployment in fname and fname.endswith('.nc') and 'FV00' in fname: + + #print(fname) #Here, the wanted file name is printed + + nc = Dataset(os.path.join(root,fname), mode = 'r') + + if 'TEMP' in nc.variables and np.array(nc.variables['TEMP'][:]).ndim == 1 and nc.variables['TIME'].getncattr('units') =='days since 1950-01-01 00:00:00 UTC': + + time_var = nc.variables["TIME"] + + time = num2date(time_var[:], units=time_var.units, calendar=time_var.calendar) + + time_deploy = parser.parse(nc.time_deployment_start, ignoretz=True) + + time_recovery = parser.parse(nc.time_deployment_end, ignoretz=True) + + #print('using '+fname) + + temp_extract = np.array(nc.variables['TEMP'][:][(time >= time_deploy) | (time <= time_recovery)]) + + # Calculate temperature changes + nc_temp_diffs = np.diff(temp_extract) + + # Extract the time data + nc_time = np.array(nc.variables['TIME'][:][(time >= time_deploy) | (time <= time_recovery)]) + + # Convert from days to hours + nc_time_hr = nc_time*24 + + ax[plt_idx].plot(nc_time,temp_extract) + + # Calculate time changes + nc_time_hr_diffs = np.diff(nc_time_hr) + + # Calculate the rate of change of temperature wrt time + nc_dtemp_dtime = np.divide(nc_temp_diffs,nc_time_hr_diffs) + + # Add the results for this netcdf to the record for the deployment + deployment_dtemp_dtime = np.concatenate((deployment_dtemp_dtime,nc_dtemp_dtime)) + + all_deployment_dtemp_dtime[plt_idx] = deployment_dtemp_dtime + + nc.close() + + + + + + + + From 222024a2dad9ce9e57992abc112c43a968908adc Mon Sep 17 00:00:00 2001 From: bweeding <57697604+bweeding@users.noreply.github.com> Date: Tue, 17 Mar 2020 16:41:42 +1100 Subject: [PATCH 33/59] new plotting codes --- ocean_dp/qc/agg_temp_plot.py | 16 +- ocean_dp/qc/temp_diff_timeseries_from_fv00.py | 156 ++++++++++++++++++ 2 files changed, 171 insertions(+), 1 deletion(-) create mode 100755 ocean_dp/qc/temp_diff_timeseries_from_fv00.py diff --git a/ocean_dp/qc/agg_temp_plot.py b/ocean_dp/qc/agg_temp_plot.py index cb92107..9902493 100755 --- a/ocean_dp/qc/agg_temp_plot.py +++ b/ocean_dp/qc/agg_temp_plot.py @@ -45,7 +45,21 @@ for i in set(np.array(ins_idx)): - ax[i].plot(time[ins_idx==i],temp[ins_idx==i]) + cur_temp = temp[ins_idx==i] + + cur_time = time[ins_idx==i] + + cur_time_hr = cur_time*24 + + # Calculate time changes + cur_time_hr_diffs = np.diff(cur_time_hr) + + cur_temp_diffs = np.diff(cur_temp) + + # Calculate the rate of change of temperature wrt time + cur_dtemp_dtime = np.divide(cur_temp_diffs,cur_time_hr_diffs) + + ax[i].scatter(cur_time,cur_temp,s=1,c=np.concatenate((np.array([0]),cur_dtemp_dtime)),cmap='cool') ax[i].annotate('S:'+str(i),xy=label_coords, xycoords=label_method,fontsize=8) diff --git a/ocean_dp/qc/temp_diff_timeseries_from_fv00.py b/ocean_dp/qc/temp_diff_timeseries_from_fv00.py new file mode 100755 index 0000000..b21a605 --- /dev/null +++ b/ocean_dp/qc/temp_diff_timeseries_from_fv00.py @@ -0,0 +1,156 @@ +# Copyright (C) 2020 Ben Weeding +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +import numpy.ma as ma +import sys +from netCDF4 import Dataset, num2date +from dateutil import parser +import numpy as np +import argparse +import glob +import pytz +import os +import matplotlib.pyplot as plt +from matplotlib import colors +from matplotlib.ticker import PercentFormatter +from sigfig import round + +def last_four(entry): + + output = entry[-4::] + + return output + + +def sp_layout(num_in): + + sp_nums = np.array([1,2,4,6,9,12,16,20,25,30]) + + sp_dict={1:[1,1],2:[2,1],4:[2,2],6:[3,2],9:[3,3],12:[4,3],16:[4,4],20:[5,4],25:[5,5],30:[6,5]} + + return sp_dict[sp_nums[np.where(num_in<=sp_nums)[0][0]]] + + + +deployments = [] + +for x in os.listdir("/Users/tru050/Desktop/cloudstor/Shared/SOTS-Temp-Raw-Data"): + + if ('Pulse' in x) or ('SOFS' in x): + + + deployments.append(x) + +deployments.sort(key=last_four) + + + + + + +for current_deployment in deployments: + + acceptable_files = [] + + print('current deployment is '+current_deployment) + + deployment_dtemp_dtime = np.array([]) + + for root, dirs, files in os.walk("/Users/tru050/Desktop/cloudstor/Shared/SOTS-Temp-Raw-Data"): + + for fname in files: + + print('checking '+fname) + + if current_deployment in fname and fname.endswith('.nc') and 'FV00' in fname: + + print('opening '+fname) + + nc = Dataset(os.path.join(root,fname), mode = 'r') + + if 'TEMP' in nc.variables and np.array(nc.variables['TEMP'][:]).ndim == 1 and nc.variables['TIME'].getncattr('units') =='days since 1950-01-01 00:00:00 UTC': + + acceptable_files.append(fname) + + print(fname+' accepted') + + nc.close() + + fig, ax = plt.subplots(sp_layout(len(acceptable_files))[0],sp_layout(len(acceptable_files))[1]) + + ax=ax.flatten() + + for fname,f_idx in zip(acceptable_files, range(0,len(acceptable_files))): + + nc = Dataset(os.path.join(root,fname), mode = 'r') + + time_var = nc.variables["TIME"] + + time = num2date(time_var[:], units=time_var.units, calendar=time_var.calendar) + + time_deploy = parser.parse(nc.time_deployment_start, ignoretz=True) + + time_recovery = parser.parse(nc.time_deployment_end, ignoretz=True) + + #print('using '+fname) + + temp_extract = np.array(nc.variables['TEMP'][:][(time > time_deploy) | (time < time_recovery)]) + + # Calculate temperature changes + nc_temp_diffs = np.diff(temp_extract) + + # Extract the time data + nc_time = np.array(nc.variables['TIME'][:][(time >= time_deploy) | (time <= time_recovery)]) + + # Convert from days to hours + nc_time_hr = nc_time*24 + + # Calculate time changes + nc_time_hr_diffs = np.diff(nc_time_hr) + + # Calculate the rate of change of temperature wrt time + nc_dtemp_dtime = np.divide(nc_temp_diffs,nc_time_hr_diffs) + + ax[f_idx].scatter(nc_time,nc_dtemp_dtime) + + # Add the results for this netcdf to the record for the deployment + deployment_dtemp_dtime = np.concatenate((deployment_dtemp_dtime,nc_dtemp_dtime)) + + nc.close() + + + + + + + + + + + + + + + + + + + + + + + + + From d0f4a8e76dd62fd049b4e27061c181365e0d97d2 Mon Sep 17 00:00:00 2001 From: bweeding <57697604+bweeding@users.noreply.github.com> Date: Tue, 24 Mar 2020 17:31:02 +1100 Subject: [PATCH 34/59] various bits --- ocean_dp/qc/agg_temp_plot.py | 23 ++- ocean_dp/qc/arg_tester.py | 38 ++++ ocean_dp/qc/rate_of_change_test.py | 4 + ocean_dp/qc/spike_test.py | 20 +- ocean_dp/qc/spike_test_ver_2.py | 149 ++++++++++++++ ocean_dp/qc/temp_diff_timeseries_from_fv00.py | 183 ++++++++++++++---- 6 files changed, 373 insertions(+), 44 deletions(-) create mode 100755 ocean_dp/qc/arg_tester.py create mode 100755 ocean_dp/qc/spike_test_ver_2.py diff --git a/ocean_dp/qc/agg_temp_plot.py b/ocean_dp/qc/agg_temp_plot.py index 9902493..71ab427 100755 --- a/ocean_dp/qc/agg_temp_plot.py +++ b/ocean_dp/qc/agg_temp_plot.py @@ -43,6 +43,17 @@ label_coords = (0.1, 0.8) label_method = 'axes fraction' +# cmap = colors.ListedColormap(['black','green','blue','red','orange']) + +# boundaries = [0,5,10,20,40,80] + +cmap = colors.ListedColormap(['blue','orange','red']) + +boundaries = [0,20,30,500] + +norm = colors.BoundaryNorm(boundaries, cmap.N, clip=True) + + for i in set(np.array(ins_idx)): cur_temp = temp[ins_idx==i] @@ -59,9 +70,17 @@ # Calculate the rate of change of temperature wrt time cur_dtemp_dtime = np.divide(cur_temp_diffs,cur_time_hr_diffs) - ax[i].scatter(cur_time,cur_temp,s=1,c=np.concatenate((np.array([0]),cur_dtemp_dtime)),cmap='cool') + im = ax[i].scatter(cur_time,cur_temp,s=1,c=np.concatenate((np.array([0]),np.abs(cur_dtemp_dtime))),cmap=cmap,norm=norm) + + #ax[i].set_title(,fontsize=10) + + ax[i].annotate('Ins:'+str(i),xy=label_coords, xycoords=label_method,fontsize=8) + + if i==27: + fig.colorbar(im) + - ax[i].annotate('S:'+str(i),xy=label_coords, xycoords=label_method,fontsize=8) +fig.colorbar(cmap) i=1 fig, ax = plt.subplots() diff --git a/ocean_dp/qc/arg_tester.py b/ocean_dp/qc/arg_tester.py new file mode 100755 index 0000000..2bfe7b4 --- /dev/null +++ b/ocean_dp/qc/arg_tester.py @@ -0,0 +1,38 @@ +# Copyright (C) 2020 Ben Weeding +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +import re +from datetime import datetime, timedelta +from netCDF4 import num2date, date2num +from netCDF4 import stringtochar +import numpy.ma as ma +import sys +from netCDF4 import Dataset +import numpy as np +import argparse +import glob +import pytz +import os + +def learn_arg_sys(text_in): + + print(text_in) + +# USE runfile('arg_tester.py', args='bing') + + +#if __name__ == "__main__": + # usage is <*args> +learn_arg_sys(text_in=sys.argv[1]) \ No newline at end of file diff --git a/ocean_dp/qc/rate_of_change_test.py b/ocean_dp/qc/rate_of_change_test.py index c3e1d9b..f4e187b 100755 --- a/ocean_dp/qc/rate_of_change_test.py +++ b/ocean_dp/qc/rate_of_change_test.py @@ -114,6 +114,8 @@ def roc_test(nc,*args,target_vars_in=[]): # For any change greater than change_per_hr, assign a qc value of 4 nc.variables[current_var+'_quality_control'][[x for x in abs(np.insert(var_roc,0,0)) > change_per_hr]] = 4 + + print(current_var + ' tested: '+str(sum([x for x in abs(np.insert(var_roc,0,0)) > change_per_hr])) + ' changes found above '+str(change_per_hr)+' '+nc.variables[current_var].units+' per hour') # update the history attribute try: @@ -152,6 +154,8 @@ def roc_test(nc,*args,target_vars_in=[]): # For any change greater than change_per_hr, assign a qc value of 4 nc.variables[current_var+'_quality_control'][[x for x in abs(np.insert(var_roc,0,0)) > rate_spec[current_var]]] = 4 + + print(current_var + ' tested: '+str(sum([x for x in abs(np.insert(var_roc,0,0)) > rate_spec[current_var]])) + ' changes found above '+str(rate_spec[current_var])+' '+nc.variables[current_var].units+' per hour') # update the history attribute try: diff --git a/ocean_dp/qc/spike_test.py b/ocean_dp/qc/spike_test.py index 154f439..a061c32 100755 --- a/ocean_dp/qc/spike_test.py +++ b/ocean_dp/qc/spike_test.py @@ -28,13 +28,13 @@ # If files aren't specified, take all the IMOS*.nc files in the current folder -def spike_test_all_files(target_vars_in=[], thresh_low=2, thresh_high=4, flag_low=3, flag_high=4): +def spike_test_all_files(target_vars_in=[], thresh_low=10, thresh_high=20, flag_low=3, flag_high=4): target_files = glob.glob('IMOS*.nc') spike_test_files(target_files, target_vars_in=target_vars_in, thresh_low=thresh_low,thresh_high=thresh_high,flag_low=flag_low, flag_high=flag_high) -def spike_test_files(target_files, target_vars_in=[], thresh_low=2, thresh_high=4, flag_low=3, flag_high=4): +def spike_test_files(target_files, target_vars_in=[], thresh_low=10, thresh_high=20, flag_low=3, flag_high=4): # Loop through each files in target_files for current_file in target_files: @@ -48,7 +48,7 @@ def spike_test_files(target_files, target_vars_in=[], thresh_low=2, thresh_high= spike_test(nc=nc, target_vars_in=target_vars_in, thresh_low=thresh_low,thresh_high=thresh_high,flag_low=flag_low, flag_high=flag_high) -def spike_test(nc, target_vars_in=[], thresh_low=2, thresh_high=4, flag_low=3, flag_high=4): +def spike_test(nc, target_vars_in=[], thresh_low=10, thresh_high=20, flag_low=3, flag_high=4): # If target_vars aren't user specified, set it to all the variables of # the current_file, removing unwanted variables @@ -77,6 +77,8 @@ def spike_test(nc, target_vars_in=[], thresh_low=2, thresh_high=4, flag_low=3, f var_data = np.array(nc.variables[current_var]) + + print('checking '+current_var+' for high spikes') # Step through the data, one element at a time, starting from the 2nd element @@ -85,8 +87,11 @@ def spike_test(nc, target_vars_in=[], thresh_low=2, thresh_high=4, flag_low=3, f # Calculate the mean of the i-1 and i+1 elements shoulder_mean = np.mean(np.take(var_data,[i-1,i+1])) + # Calculate the step changes + shoulder_diff = np.diff(var_data[i-1:i+2]) + # Check for spike exceeding high threshold - if abs(var_data[i]-shoulder_mean) > thresh_high: + if (abs(var_data[i]-shoulder_mean) > thresh_high) & (True in (shoulder_diff>0)) & (True in (shoulder_diff<0)):# & (1.25*abs(shoulder_diff[0]) >= abs(x[1]) >= 0.75*abs(shoulder_diff[0])): print('High spike found') @@ -125,14 +130,17 @@ def spike_test(nc, target_vars_in=[], thresh_low=2, thresh_high=4, flag_low=3, f # Calculate the mean of the i-1 and i+1 elements shoulder_mean = np.mean(np.take(var_data,[i-1,i+1])) + # Calculate the step changes + shoulder_diff = np.diff(var_data[i-1:i+2]) + #print('shoulder mean is '+str(shoulder_mean)) - abs_diff = abs(var_data[i]-shoulder_mean) + #abs_diff = abs(var_data[i]-shoulder_mean) #print('absolute difference is '+str(abs_diff)) # Check for spike exceeding low threshold - if abs(var_data[i]-shoulder_mean) > thresh_low: + if (abs(var_data[i]-shoulder_mean) > thresh_low) & (True in (shoulder_diff>0)) & (True in (shoulder_diff<0)): #& (1.25*abs(shoulder_diff[0]) >= abs(x[1]) >= 0.75*abs(shoulder_diff[0])): print('Low spike found') diff --git a/ocean_dp/qc/spike_test_ver_2.py b/ocean_dp/qc/spike_test_ver_2.py new file mode 100755 index 0000000..68fe69e --- /dev/null +++ b/ocean_dp/qc/spike_test_ver_2.py @@ -0,0 +1,149 @@ +# Copyright (C) 2020 Ben Weeding +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +import re +from datetime import datetime, timedelta +from netCDF4 import num2date, date2num +from netCDF4 import stringtochar +import numpy.ma as ma +import sys +from netCDF4 import Dataset +import numpy as np +import argparse +import glob +import pytz +import os + +# If files aren't specified, take all the IMOS*.nc files in the current folder +def spike_test_all_files(target_vars_in=[], thresh_low=2, thresh_high=4, flag_low=3, flag_high=4): + target_files = glob.glob('IMOS*.nc') + + spike_test_files(target_files, target_vars_in=target_vars_in, thresh_low=thresh_low,thresh_high=thresh_high,flag_low=flag_low, flag_high=flag_high) + + +def spike_test_files(target_files, target_vars_in=[], thresh_low=2, thresh_high=4, flag_low=3, flag_high=4): + + # Loop through each files in target_files + for current_file in target_files: + # Print each filename + print("input file %s" % current_file) + + # Extract netcdf data into nc + nc = Dataset(current_file, mode="a") + + # run the spike test + spike_test(nc=nc, target_vars_in=target_vars_in, thresh_low=thresh_low,thresh_high=thresh_high,flag_low=flag_low, flag_high=flag_high) + + + +def spike_test(nc, target_vars_in=[], thresh_low=2, thresh_high=4, flag_low=3, flag_high=4): + + # If target_vars aren't user specified, set it to all the variables of + # the current_file, removing unwanted variables + if target_vars_in == []: + + target_vars = list(nc.variables.keys()) + + # Remove TIME + target_vars.remove('TIME') + + # Remove any quality_control variables + qc_vars = [s for s in target_vars if 'quality_control' in s] + target_vars = [s for s in target_vars if s not in qc_vars] + + # Remove any variables of single length + single_vars = [s for s in target_vars if nc.variables[s].size==1] + target_vars = [s for s in target_vars if s not in single_vars] + + print('target_vars are '+' '.join(target_vars)) + + else: + target_vars = target_vars_in + + # For each variable, extract the data + for current_var in target_vars: + + var_data = np.array(nc.variables[current_var]) + + print('checking '+current_var+' for high spikes') + + # Step through the data, one element at a time, starting from the 2nd element + for i in range(1,(len(var_data)-1)): + + # Calculate the mean of the i-1 and i+1 elements + shoulder_mean = np.mean(np.take(var_data,[i-1,i+1])) + + # Calculate the step changes + shoulder_diff = np.diff(var_data[i-1:i+2]) + + # Check for spike exceeding high threshold + if (abs(var_data[i]-shoulder_mean) > thresh_high) & (True in (shoulder_diff>=0)) & (False in (shoulder_diff>=0)): + + print('High spike found') + + #set corresponding QC value to... + nc.variables[current_var+'_quality_control'][i] = flag_high + + print('checking '+current_var+' for low spikes') + + # For each of the remaining indices + for i in low_spike_chk_idx: + + #print('i is '+str(i)) + + # Calculate the mean of the i-1 and i+1 elements + shoulder_mean = np.mean(np.take(var_data,[i-1,i+1])) + + # Calculate the step changes + shoulder_diff = np.diff(var_data[i-1:i+2]) + + #print('shoulder mean is '+str(shoulder_mean)) + + abs_diff = abs(var_data[i]-shoulder_mean) + + #print('absolute difference is '+str(abs_diff)) + + # Check for spike exceeding low threshold + if (abs(var_data[i]-shoulder_mean) > thresh_low) & (True in (shoulder_diff>=0)) & (False in (shoulder_diff>=0)): + + print('Low spike found') + + #set corresponding QC value to... + nc.variables[current_var+'_quality_control'][i] = flag_low + + # # Extract the qc data + # current_qc = np.array(nc.variables[current_var+'_quality_control'][:]) + + # # Find all the instances of consecutive 3s, and reset them to 0 + # for i in np.where(current_qc==3)[0][0:-1][np.diff(np.where(current_qc==3)[0])==1]: + + # nc.variables[current_var+'_quality_control'][i:i+2] = 0 + + + # update the history attribute + try: + hist = nc.history + "\n" + except AttributeError: + hist = "" + + nc.setncattr('history', hist + datetime.utcnow().strftime("%Y-%m-%d") + ' :spike_test performed on [' + str(target_vars) + '], with spikes greater than '+str(thresh_high)+' flagged as '+str(flag_high)+' and spikes greater than '+str(thresh_low)+' flagged as '+str(flag_low)) + + nc.close() + +if __name__ == "__main__": + # usage is + spike_test_files(target_files=[sys.argv[1]], target_vars_in=[sys.argv[2]], thresh_low=float(sys.argv[3]), thresh_high=float(sys.argv[4]), flag_low= float(sys.argv[5]), flag_high= float(sys.argv[6])) + + \ No newline at end of file diff --git a/ocean_dp/qc/temp_diff_timeseries_from_fv00.py b/ocean_dp/qc/temp_diff_timeseries_from_fv00.py index b21a605..9a38ccf 100755 --- a/ocean_dp/qc/temp_diff_timeseries_from_fv00.py +++ b/ocean_dp/qc/temp_diff_timeseries_from_fv00.py @@ -44,11 +44,19 @@ def sp_layout(num_in): +cmap = colors.ListedColormap(['blue','orange','red']) + +boundaries = [0,20,30,2000] + +norm = colors.BoundaryNorm(boundaries, cmap.N, clip=True) + + + deployments = [] -for x in os.listdir("/Users/tru050/Desktop/cloudstor/Shared/SOTS-Temp-Raw-Data"): +for x in next(os.walk("/Users/tru050/Desktop/cloudstor/Shared/SOTS-Temp-Raw-Data"))[1]: - if ('Pulse' in x) or ('SOFS' in x): + if (('Pulse' in x) or ('SOFS' in x)): deployments.append(x) @@ -64,6 +72,8 @@ def sp_layout(num_in): acceptable_files = [] + acceptable_depths = [] + print('current deployment is '+current_deployment) deployment_dtemp_dtime = np.array([]) @@ -82,57 +92,158 @@ def sp_layout(num_in): if 'TEMP' in nc.variables and np.array(nc.variables['TEMP'][:]).ndim == 1 and nc.variables['TIME'].getncattr('units') =='days since 1950-01-01 00:00:00 UTC': - acceptable_files.append(fname) + acceptable_files.append(os.path.join(root,fname)) + + acceptable_depths.append(nc.instrument_nominal_depth) print(fname+' accepted') nc.close() - - fig, ax = plt.subplots(sp_layout(len(acceptable_files))[0],sp_layout(len(acceptable_files))[1]) - - ax=ax.flatten() - for fname,f_idx in zip(acceptable_files, range(0,len(acceptable_files))): + + acceptable_files = [x for _,x in sorted(zip(acceptable_depths,acceptable_files))] - nc = Dataset(os.path.join(root,fname), mode = 'r') + + fig, ax = plt.subplots(sp_layout(len(acceptable_files))[0],sp_layout(len(acceptable_files))[1],figsize=((12,8)),constrained_layout=True,sharex=True) + + ax=ax.flatten() + + fig.suptitle(current_deployment, fontsize=14) + - time_var = nc.variables["TIME"] - - time = num2date(time_var[:], units=time_var.units, calendar=time_var.calendar) - - time_deploy = parser.parse(nc.time_deployment_start, ignoretz=True) - - time_recovery = parser.parse(nc.time_deployment_end, ignoretz=True) + for fpath,f_idx in zip(acceptable_files, range(0,len(acceptable_files))): - #print('using '+fname) + nc = Dataset(fpath, mode = 'r') #mismatching files and folders?? looking for sofs7.5 file in pulse 7 folder!?!? - temp_extract = np.array(nc.variables['TEMP'][:][(time > time_deploy) | (time < time_recovery)]) - - # Calculate temperature changes - nc_temp_diffs = np.diff(temp_extract) - - # Extract the time data - nc_time = np.array(nc.variables['TIME'][:][(time >= time_deploy) | (time <= time_recovery)]) + time_var = nc.variables["TIME"] - # Convert from days to hours - nc_time_hr = nc_time*24 - - # Calculate time changes - nc_time_hr_diffs = np.diff(nc_time_hr) - - # Calculate the rate of change of temperature wrt time - nc_dtemp_dtime = np.divide(nc_temp_diffs,nc_time_hr_diffs) + time = num2date(time_var[:], units=time_var.units, calendar=time_var.calendar) + + time_deploy = parser.parse(nc.time_deployment_start, ignoretz=True) + + time_recovery = parser.parse(nc.time_deployment_end, ignoretz=True) + + #print('using '+fname) + + temp_extract = np.array(nc.variables['TEMP'][:][(time > time_deploy) & (time < time_recovery)]) + + # Calculate temperature changes + nc_temp_diffs = np.diff(temp_extract) + + # Extract the time data + nc_time = np.array(nc.variables['TIME'][:][(time > time_deploy) & (time < time_recovery)]) + + # Convert from days to hours + nc_time_hr = nc_time*24 + + # Calculate time changes + nc_time_hr_diffs = np.diff(nc_time_hr) + + # Calculate the rate of change of temperature wrt time + nc_dtemp_dtime = np.divide(nc_temp_diffs,nc_time_hr_diffs) + + im=ax[f_idx].scatter(nc_time,temp_extract,s=0.2,c=np.concatenate((np.array([0]),np.abs(nc_dtemp_dtime))),cmap=cmap,norm=norm) + + im=ax[f_idx].scatter(nc_time[np.concatenate((np.array([0]),np.abs(nc_dtemp_dtime)))>20],temp_extract[np.concatenate((np.array([0]),np.abs(nc_dtemp_dtime)))>20],s=0.5,c=np.concatenate((np.array([0]),np.abs(nc_dtemp_dtime)))[np.concatenate((np.array([0]),np.abs(nc_dtemp_dtime)))>20],cmap=cmap,norm=norm) + + ax[f_idx].set_title(str(nc.instrument_nominal_depth)+'m',fontsize=10) + + # Add the results for this netcdf to the record for the deployment + deployment_dtemp_dtime = np.concatenate((deployment_dtemp_dtime,nc_dtemp_dtime)) + + nc.close() + + if f_idx==0: - ax[f_idx].scatter(nc_time,nc_dtemp_dtime) + fig.colorbar(im) - # Add the results for this netcdf to the record for the deployment - deployment_dtemp_dtime = np.concatenate((deployment_dtemp_dtime,nc_dtemp_dtime)) + for f_idx in range(len(acceptable_files),len(ax)): - nc.close() + ax[f_idx].set_axis_off() +############################################################################## +fig, ax = plt.subplots(sp_layout(len(deployments))[0],sp_layout(len(deployments))[1],figsize=((12,8)),constrained_layout=True,sharex=False) +ax=ax.flatten() + +for current_deployment,d_idx in zip(deployments,range(0,len(deployments))): + + acceptable_files = [] + + acceptable_depths = [] + + print('current deployment is '+current_deployment) + + deployment_dtemp_dtime = np.array([]) + + for root, dirs, files in os.walk("/Users/tru050/Desktop/cloudstor/Shared/SOTS-Temp-Raw-Data"): + + for fname in files: + + print('checking '+fname) + + if current_deployment in fname and fname.endswith('.nc') and 'FV00' in fname: + + print('opening '+fname) + + nc = Dataset(os.path.join(root,fname), mode = 'r') + + if 'TEMP' in nc.variables and np.array(nc.variables['TEMP'][:]).ndim == 1 and nc.variables['TIME'].getncattr('units') =='days since 1950-01-01 00:00:00 UTC': + + acceptable_files.append(os.path.join(root,fname)) + + acceptable_depths.append(nc.instrument_nominal_depth) + + print(fname+' accepted') + + nc.close() + + + acceptable_files = [x for _,x in sorted(zip(acceptable_depths,acceptable_files))] + + + for fpath in acceptable_files: + + nc = Dataset(fpath, mode = 'r') #mismatching files and folders?? looking for sofs7.5 file in pulse 7 folder!?!? + + time_var = nc.variables["TIME"] + + time = num2date(time_var[:], units=time_var.units, calendar=time_var.calendar) + + time_deploy = parser.parse(nc.time_deployment_start, ignoretz=True) + + time_recovery = parser.parse(nc.time_deployment_end, ignoretz=True) + + #print('using '+fname) + + temp_extract = np.array(nc.variables['TEMP'][:][(time > time_deploy) & (time < time_recovery)]) + + # Calculate temperature changes + nc_temp_diffs = np.diff(temp_extract) + + # Extract the time data + nc_time = np.array(nc.variables['TIME'][:][(time > time_deploy) & (time < time_recovery)]) + + # Convert from days to hours + nc_time_hr = nc_time*24 + + # Calculate time changes + nc_time_hr_diffs = np.diff(nc_time_hr) + + # Calculate the rate of change of temperature wrt time + nc_dtemp_dtime = np.divide(nc_temp_diffs,nc_time_hr_diffs) + + # Add the results for this netcdf to the record for the deployment + deployment_dtemp_dtime = np.concatenate((deployment_dtemp_dtime,nc_dtemp_dtime)) + + nc.close() + + ax[d_idx].hist(deployment_dtemp_dtime,21,log=True) + + ax[d_idx].set_title(current_deployment,fontsize=10) + From 5b390327300e528da818f7b281746b248a63f407 Mon Sep 17 00:00:00 2001 From: bweeding <57697604+bweeding@users.noreply.github.com> Date: Tue, 21 Apr 2020 17:15:06 +1000 Subject: [PATCH 35/59] QC and preprocessing --- ocean_dp/aggregation/copyDataset.py | 73 ++++----- ocean_dp/processing/pressure_interpolator.py | 7 +- ocean_dp/qc/flatline_test.py | 27 +++- ocean_dp/qc/global_range.py | 69 +++++++-- ocean_dp/qc/in_out_water.py | 66 ++++---- ocean_dp/qc/rate_of_change_test.py | 43 +++++- ocean_dp/qc/spike_test.py | 64 ++++---- ocean_dp/qc/spike_test_ver_2.py | 149 ------------------- ocean_dp/sots_processing_runthrough.py | 74 +++++++++ 9 files changed, 306 insertions(+), 266 deletions(-) delete mode 100755 ocean_dp/qc/spike_test_ver_2.py create mode 100755 ocean_dp/sots_processing_runthrough.py diff --git a/ocean_dp/aggregation/copyDataset.py b/ocean_dp/aggregation/copyDataset.py index 95c5998..54eb960 100644 --- a/ocean_dp/aggregation/copyDataset.py +++ b/ocean_dp/aggregation/copyDataset.py @@ -38,7 +38,10 @@ def aggregate(files, varNames): # look over all files, create a time array from all files # TODO: maybe delete files here without variables we're not interested in # TODO: Create set of variables in all files - + if not isinstance(varNames, list): + + varNames = [varNames] + filen = 0 for path_file in files: @@ -351,56 +354,56 @@ def aggregate(files, varNames): return outputName -def collect_vars_to_agg(files): +# def collect_vars_to_agg(files): - var_list = [] +# var_list = [] - nc = Dataset(files[0]) - varList = nc.variables +# nc = Dataset(files[0]) +# varList = nc.variables - # default to all variables in first file should no variable be specified - var_list.extend(varList.keys()) - var_list.remove("TIME") +# # default to all variables in first file should no variable be specified +# var_list.extend(varList.keys()) +# var_list.remove("TIME") - nc.close() +# nc.close() - print("collect_vars_to_agg::", var_list) +# print("collect_vars_to_agg::", var_list) - return var_list +# return var_list -if __name__ == "__main__": +# if __name__ == "__main__": - files = [] - varToAgg = None # defaults to all in first file +# files = [] +# varToAgg = None # defaults to all in first file - if len(sys.argv) > 1: - parser = argparse.ArgumentParser() - parser.add_argument('-v', action='append', dest='var', help='variable to include in output file (defaults to all)') - parser.add_argument('-f', dest='filelist', help='read file names from file') - parser.add_argument('file', nargs='*', help='input file name') - args = parser.parse_args() +# if len(sys.argv) > 1: +# parser = argparse.ArgumentParser() +# parser.add_argument('-v', action='append', dest='var', help='variable to include in output file (defaults to all)') +# parser.add_argument('-f', dest='filelist', help='read file names from file') +# parser.add_argument('file', nargs='*', help='input file name') +# args = parser.parse_args() - if not isinstance(args.filelist, type(None)): - with open(args.filelist, "r") as ins: - for line in ins: - print(line) - files.append(line.strip()) +# if not isinstance(args.filelist, type(None)): +# with open(args.filelist, "r") as ins: +# for line in ins: +# print(line) +# files.append(line.strip()) - if len(args.file): - # files = args.file - for fn in args.file: - files.extend(glob.glob(fn)) +# if len(args.file): +# # files = args.file +# for fn in args.file: +# files.extend(glob.glob(fn)) - varToAgg = args.var +# varToAgg = args.var - if isinstance(varToAgg, type(None)): - varToAgg = collect_vars_to_agg(files) +# if isinstance(varToAgg, type(None)): +# varToAgg = collect_vars_to_agg(files) - print("Aggregating variables ", varToAgg) +# print("Aggregating variables ", varToAgg) - outputName = aggregate(files, varToAgg) +# outputName = aggregate(files, varToAgg) - print("Output file : %s" % outputName) +# print("Output file : %s" % outputName) diff --git a/ocean_dp/processing/pressure_interpolator.py b/ocean_dp/processing/pressure_interpolator.py index af248c2..0473144 100755 --- a/ocean_dp/processing/pressure_interpolator.py +++ b/ocean_dp/processing/pressure_interpolator.py @@ -26,6 +26,8 @@ def pressure_interpolator(netCDFfiles = [],agg = []): + files_out = [] + if netCDFfiles==[]: print('netcdffiles = none') @@ -65,6 +67,8 @@ def pressure_interpolator(netCDFfiles = [],agg = []): # a copy of the old file with the new name if fn_new != fn: + files_out.append(fn_new) + print('copying file') # copy file shutil.copy(fn, fn_new) @@ -254,7 +258,8 @@ def pressure_interpolator(netCDFfiles = [],agg = []): fv01_contents.close() agg.close() - + + return files_out diff --git a/ocean_dp/qc/flatline_test.py b/ocean_dp/qc/flatline_test.py index e211cf4..a9e7d23 100755 --- a/ocean_dp/qc/flatline_test.py +++ b/ocean_dp/qc/flatline_test.py @@ -28,13 +28,13 @@ # If files aren't specified, take all the IMOS*.nc files in the current folder -def flatline_test_all_files(target_vars_in=[], window=3, flag=3): +def flatline_test_all_files(target_vars_in=[], window=3, flag=4): target_files = glob.glob('IMOS*.nc') flatline_test_files(target_files, target_vars_in=target_vars_in, window=window, flag=flag) -def flatline_test_files(target_files, target_vars_in=[], window=3, flag=3): +def flatline_test_files(target_files, target_vars_in=[], window=3, flag=4): # Loop through each files in target_files for current_file in target_files: @@ -48,7 +48,7 @@ def flatline_test_files(target_files, target_vars_in=[], window=3, flag=3): flatline_test(nc=nc, target_vars_in=target_vars_in, window=window, flag=flag) -def flatline_test(nc, target_vars_in=[], window=3, flag=3): +def flatline_test(nc, target_vars_in=[], window=3, flag=4): # If target_vars aren't user specified, set it to all the variables of # the current_file, removing unwanted variables @@ -75,6 +75,21 @@ def flatline_test(nc, target_vars_in=[], window=3, flag=3): # For each variable, extract the data for current_var in target_vars: + # Extract the variable + nc_var = nc.variables[current_var] + + if nc_var.name + "_quality_control_flt" in nc.variables: + ncVarOut = nc.variables[nc_var.name + "_quality_control_flt"] + else: + ncVarOut = nc.createVariable(nc_var.name + "_quality_control_flt", "i1", nc_var.dimensions, fill_value=99, zlib=True) # fill_value=0 otherwise defaults to max + ncVarOut[:] = np.zeros(nc_var.shape) + ncVarOut.long_name = "quality flag for " + nc_var.name + ncVarOut.flag_values = np.array([0, 1, 2, 3, 4, 6, 7, 9], dtype=np.int8) + ncVarOut.flag_meanings = 'unknown good_data probably_good_data probably_bad_data bad_data not_deployed interpolated missing_value' + + # add new variable to list of aux variables + nc_var.ancillary_variables = nc_var.ancillary_variables + " " + nc_var.name + "_quality_control_flt" + var_data = np.array(nc.variables[current_var]) print('checking '+current_var) @@ -86,13 +101,17 @@ def flatline_test(nc, target_vars_in=[], window=3, flag=3): if len(set(var_data[i:(i+window)])) == 1: # set corresponding QC value to... - nc.variables[current_var+'_quality_control'][i:(i+window)] = flag + nc.variables[current_var+'_quality_control_flt'][i:(i+window)] = flag + + nc.variables[current_var + "_quality_control"][:] = np.maximum(nc.variables[current_var + "_quality_control_flt"][:],nc.variables[current_var + "_quality_control"][:]) # update the history attribute try: hist = nc.history + "\n" except AttributeError: hist = "" + + nc.setncattr('history', hist + datetime.utcnow().strftime("%Y-%m-%d") + 'flatline_test performed on [' + str(target_vars) + '], window '+str(window)+' consecutive values or more were flagged with '+str(flag) ) diff --git a/ocean_dp/qc/global_range.py b/ocean_dp/qc/global_range.py index e11bf40..68e2ee0 100644 --- a/ocean_dp/qc/global_range.py +++ b/ocean_dp/qc/global_range.py @@ -28,29 +28,71 @@ # flag 4 (bad) when out of global range -def global_range(netCDFfile, variable, max, min): +def global_range(netCDFfile, variable, max, min, qc_value=4): ds = Dataset(netCDFfile, 'a') - var = ds.variables[variable] + nc_var = ds.variables[variable] + var_data = nc_var[:] + var_data.mask = False try: - var_qc = ds.variables[variable + "_quality_control"] + # find the existing quality_control variable in the auxillary variables list + aux_vars = nc_var.ancillary_variables + aux_var = aux_vars.split(" ") + qc_vars = [i for i in aux_var if i.endswith("_quality_control")] + qc_var = qc_vars[0] + print("QC var name ", qc_var) + var_qc = ds.variables[qc_var] except KeyError: print("no QC variable found") return None + # read existing quality_control flags + qc = var_qc[:] + # this is where the actual QC test is done - mask = ((var[:] > max) | (var[:] < min)) + mask = ((var_data > max) | (var_data < min)) + print('mask data ', mask) + + # create a qc variable just for this test flags + if nc_var.name + "_quality_control_gr" in ds.variables: + ncVarOut = ds.variables[nc_var.name + "_quality_control_gr"] + else: + ncVarOut = ds.createVariable(nc_var.name + "_quality_control_gr", "i1", nc_var.dimensions, fill_value=99, zlib=True) # fill_value=0 otherwise defaults to max + ncVarOut[:] = np.zeros(nc_var.shape) + ncVarOut.long_name = "quality flag for " + nc_var.name + ncVarOut.flag_values = np.array([0, 1, 2, 3, 4, 6, 7, 9], dtype=np.int8) + ncVarOut.flag_meanings = 'unknown good_data probably_good_data probably_bad_data bad_data not_deployed interpolated missing_value' + + # add new variable to list of aux variables + nc_var.ancillary_variables = nc_var.ancillary_variables + " " + nc_var.name + "_quality_control_gr" + + # store the qc flags + ncVarOut[mask] = qc_value - mask = mask & (var_qc[:] < 1) # only mark data that has not been QCd already + # store qc flags to main quality_control flags variable + mask = mask & (qc < 1) # only mark data that has not been QCd already + print('mask other qc ', mask) - var_qc[mask] = 4 - count = sum(mask) - print('marked records ', count) + qc[mask] = qc_value # mark the out of range points with bad_data + + # calculate the number of points marked as bad_data + marked = np.zeros_like(qc) + marked[mask] = 1 + count = sum(marked) + print('marked records ', count, mask, qc) + + # write flags back to main QC variable + var_qc[:] = qc # update the history attribute - history = ds.history - ds.setncattr("history", history + "\n" + datetime.utcnow().strftime("%Y-%m-%d") + " " + variable + " global range min = " + str(min) + " max = " + str(max) + " marked " + str(count)) + try: + hist = ds.history + "\n" + except AttributeError: + hist = "" + ds.setncattr("history", hist + datetime.utcnow().strftime("%Y-%m-%d") + " " + variable + " global range min = " + str(min) + " max = " + str(max) + " marked " + str(count)) + + ds.variables[variable + "_quality_control"][:] = np.maximum(ds.variables[variable + "_quality_control_gr"][:],ds.variables[variable + "_quality_control"][:]) ds.close() @@ -59,5 +101,8 @@ def global_range(netCDFfile, variable, max, min): if __name__ == "__main__": - # usage is - global_range(sys.argv[1], sys.argv[2], float(sys.argv[3]), float(sys.argv[4])) + # usage is + if len(sys.argv) > 5: + global_range(sys.argv[1], sys.argv[2], max=float(sys.argv[3]), min=float(sys.argv[4]), qc_value=int(sys.argv[5])) + else: + global_range(sys.argv[1], sys.argv[2], max=float(sys.argv[3]), min=float(sys.argv[4])) \ No newline at end of file diff --git a/ocean_dp/qc/in_out_water.py b/ocean_dp/qc/in_out_water.py index 2543c23..e8635ed 100644 --- a/ocean_dp/qc/in_out_water.py +++ b/ocean_dp/qc/in_out_water.py @@ -27,49 +27,56 @@ # flag out of water as QC value 7 (not_deployed), with wise leave as 0 -def in_out_water(netCDFfile): +def in_out_water(netCDFfile, var_name=None): ds = Dataset(netCDFfile, 'a') - vars = ds.variables - + nc_vars = ds.variables to_add = [] - for v in vars: - #print (vars[v].dimensions) - if v != 'TIME': - to_add.append(v) - - time_var = vars["TIME"] + if var_name: + to_add.append(var_name) + else: + for v in nc_vars: + #print (vars[v].dimensions) + if v != 'TIME': + to_add.append(v) + + time_var = nc_vars["TIME"] time = num2date(time_var[:], units=time_var.units, calendar=time_var.calendar) time_deploy = parser.parse(ds.time_deployment_start, ignoretz=True) time_recovery = parser.parse(ds.time_deployment_end, ignoretz=True) - print(time_deploy) + print('deployment time', time_deploy) print(to_add) - for v in to_add: - if "TIME" in vars[v].dimensions: - if v.endswith("_quality_control"): + # create a mask for the time range + mask = (time <= time_deploy) | (time >= time_recovery) + for v in to_add: + if "TIME" in nc_vars[v].dimensions: + if v.endswith("_quality_control"): print("QC time dim ", v) - ncVarOut = vars[v] - mask = (time <= time_deploy) | (time >= time_recovery) - ncVarOut[mask] = np.ones(vars[v].shape)[mask] * 7 - + ncVarOut = nc_vars[v] + ncVarOut[mask] = 7 + else: + # create a qc variable just for this test flags + if v + "_quality_control_io" in ds.variables: + ncVarOut = ds.variables[v + "_quality_control_io"] + else: + ncVarOut = ds.createVariable(v + "_quality_control_io", "i1", nc_vars[v].dimensions, fill_value=99, zlib=True) # fill_value=0 otherwise defaults to max + ncVarOut[:] = np.zeros(nc_vars[v].shape) + ncVarOut.long_name = "quality flag for " + v + ncVarOut.flag_values = np.array([0, 1, 2, 3, 4, 6, 7, 9], dtype=np.int8) + ncVarOut.flag_meanings = 'unknown good_data probably_good_data probably_bad_data bad_data not_deployed interpolated missing_value' + + nc_vars[v].ancillary_variables = nc_vars[v].ancillary_variables + " " + v + "_quality_control_io" + ncVarOut[mask] = 7 + + ds.variables[v + "_quality_control"][:] = np.maximum(ds.variables[v + "_quality_control_io"][:],ds.variables[v + "_quality_control"][:]) ds.file_version = "Level 1 - Quality Controlled Data" - - # update the history attribute - try: - hist = nc.history + "\n" - - except AttributeError: - hist = "" - - nc.setncattr('history', hist + datetime.utcnow().strftime("%Y-%m-%d") + ': in water test performed, with out of water data flagged at QC=7') - ds.close() @@ -77,4 +84,7 @@ def in_out_water(netCDFfile): if __name__ == "__main__": - in_out_water(sys.argv[1]) + if len(sys.argv) > 2 & sys.argv[1].startswith('-'): + in_out_water(sys.argv[2], var_name=sys.argv[1][1:]) + else: + in_out_water(sys.argv[1]) \ No newline at end of file diff --git a/ocean_dp/qc/rate_of_change_test.py b/ocean_dp/qc/rate_of_change_test.py index f4e187b..d889b1f 100755 --- a/ocean_dp/qc/rate_of_change_test.py +++ b/ocean_dp/qc/rate_of_change_test.py @@ -106,14 +106,31 @@ def roc_test(nc,*args,target_vars_in=[]): # For each variable for current_var in target_vars: - # Extract the data - var_data = np.array(nc.variables[current_var]) + # Extract the variable + nc_var = nc.variables[current_var] + + if nc_var.name + "_quality_control_roc" in nc.variables: + ncVarOut = nc.variables[nc_var.name + "_quality_control_roc"] + else: + ncVarOut = nc.createVariable(nc_var.name + "_quality_control_roc", "i1", nc_var.dimensions, fill_value=99, zlib=True) # fill_value=0 otherwise defaults to max + ncVarOut[:] = np.zeros(nc_var.shape) + ncVarOut.long_name = "quality flag for " + nc_var.name + ncVarOut.flag_values = np.array([0, 1, 2, 3, 4, 6, 7, 9], dtype=np.int8) + ncVarOut.flag_meanings = 'unknown good_data probably_good_data probably_bad_data bad_data not_deployed interpolated missing_value' + + # add new variable to list of aux variables + nc_var.ancillary_variables = nc_var.ancillary_variables + " " + nc_var.name + "_quality_control_roc" + + # Extract the variable data + var_data = np.array(nc.variables[current_var][:]) # Calculate dvar/dtime var_roc = np.divide(np.diff(var_data),np.diff(nc_time_hr)) # For any change greater than change_per_hr, assign a qc value of 4 - nc.variables[current_var+'_quality_control'][[x for x in abs(np.insert(var_roc,0,0)) > change_per_hr]] = 4 + nc.variables[current_var+'_quality_control_roc'][[x for x in abs(np.insert(var_roc,0,0)) > change_per_hr]] = 4 + + nc.variables[current_var + "_quality_control"][:] = np.maximum(nc.variables[current_var + "_quality_control_roc"][:],nc.variables[current_var + "_quality_control"][:]) print(current_var + ' tested: '+str(sum([x for x in abs(np.insert(var_roc,0,0)) > change_per_hr])) + ' changes found above '+str(change_per_hr)+' '+nc.variables[current_var].units+' per hour') @@ -146,6 +163,22 @@ def roc_test(nc,*args,target_vars_in=[]): # For each variable for current_var in target_vars: + # Extract the variable + nc_var = nc.variables[current_var] + + if nc_var.name + "_quality_control_roc" in nc.variables: + ncVarOut = nc.variables[nc_var.name + "_quality_control_roc"] + else: + ncVarOut = nc.createVariable(nc_var.name + "_quality_control_roc", "i1", nc_var.dimensions, fill_value=99, zlib=True) # fill_value=0 otherwise defaults to max + ncVarOut[:] = np.zeros(nc_var.shape) + ncVarOut.long_name = "quality flag for " + nc_var.name + ncVarOut.flag_values = np.array([0, 1, 2, 3, 4, 6, 7, 9], dtype=np.int8) + ncVarOut.flag_meanings = 'unknown good_data probably_good_data probably_bad_data bad_data not_deployed interpolated missing_value' + + # add new variable to list of aux variables + nc_var.ancillary_variables = nc_var.ancillary_variables + " " + nc_var.name + "_quality_control_roc" + + # Extract the data var_data = np.array(nc.variables[current_var]) @@ -153,7 +186,9 @@ def roc_test(nc,*args,target_vars_in=[]): var_roc = np.divide(np.diff(var_data),np.diff(nc_time_hr)) # For any change greater than change_per_hr, assign a qc value of 4 - nc.variables[current_var+'_quality_control'][[x for x in abs(np.insert(var_roc,0,0)) > rate_spec[current_var]]] = 4 + nc.variables[current_var+'_quality_control_roc'][[x for x in abs(np.insert(var_roc,0,0)) > rate_spec[current_var]]] = 4 + + nc.variables[current_var + "_quality_control"][:] = np.maximum(nc.variables[current_var + "_quality_control_roc"][:],nc.variables[current_var + "_quality_control"][:]) print(current_var + ' tested: '+str(sum([x for x in abs(np.insert(var_roc,0,0)) > rate_spec[current_var]])) + ' changes found above '+str(rate_spec[current_var])+' '+nc.variables[current_var].units+' per hour') diff --git a/ocean_dp/qc/spike_test.py b/ocean_dp/qc/spike_test.py index a061c32..622d3a3 100755 --- a/ocean_dp/qc/spike_test.py +++ b/ocean_dp/qc/spike_test.py @@ -26,15 +26,19 @@ import pytz import os +default_high = 100 + +default_low = 50 + # If files aren't specified, take all the IMOS*.nc files in the current folder -def spike_test_all_files(target_vars_in=[], thresh_low=10, thresh_high=20, flag_low=3, flag_high=4): +def spike_test_all_files(target_vars_in=[], thresh_low=default_low, thresh_high=default_high, flag_low=3, flag_high=4): target_files = glob.glob('IMOS*.nc') spike_test_files(target_files, target_vars_in=target_vars_in, thresh_low=thresh_low,thresh_high=thresh_high,flag_low=flag_low, flag_high=flag_high) -def spike_test_files(target_files, target_vars_in=[], thresh_low=10, thresh_high=20, flag_low=3, flag_high=4): +def spike_test_files(target_files, target_vars_in=[], thresh_low=default_low, thresh_high=default_high, flag_low=3, flag_high=4): # Loop through each files in target_files for current_file in target_files: @@ -48,7 +52,7 @@ def spike_test_files(target_files, target_vars_in=[], thresh_low=10, thresh_high spike_test(nc=nc, target_vars_in=target_vars_in, thresh_low=thresh_low,thresh_high=thresh_high,flag_low=flag_low, flag_high=flag_high) -def spike_test(nc, target_vars_in=[], thresh_low=10, thresh_high=20, flag_low=3, flag_high=4): +def spike_test(nc, target_vars_in=[], thresh_low=default_low, thresh_high=default_high, flag_low=3, flag_high=4): # If target_vars aren't user specified, set it to all the variables of # the current_file, removing unwanted variables @@ -72,12 +76,27 @@ def spike_test(nc, target_vars_in=[], thresh_low=10, thresh_high=20, flag_low=3, else: target_vars = target_vars_in - # For each variable, extract the data + # For each variable for current_var in target_vars: - var_data = np.array(nc.variables[current_var]) - - + # Extract the variable + nc_var = nc.variables[current_var] + + # Create a test specific qc variable if it doesn't already exist + if nc_var.name + "_quality_control_spk" in nc.variables: + ncVarOut = nc.variables[nc_var.name + "_quality_control_spk"] + else: + ncVarOut = nc.createVariable(nc_var.name + "_quality_control_spk", "i1", nc_var.dimensions, fill_value=99, zlib=True) # fill_value=0 otherwise defaults to max + ncVarOut[:] = np.zeros(nc_var.shape) + ncVarOut.long_name = "quality flag for " + nc_var.name + ncVarOut.flag_values = np.array([0, 1, 2, 3, 4, 6, 7, 9], dtype=np.int8) + ncVarOut.flag_meanings = 'unknown good_data probably_good_data probably_bad_data bad_data not_deployed interpolated missing_value' + + # add new variable to list of aux variables + nc_var.ancillary_variables = nc_var.ancillary_variables + " " + nc_var.name + "_quality_control_spk" + + # Extract the variable data + var_data = np.array(nc.variables[current_var][:]) print('checking '+current_var+' for high spikes') @@ -96,20 +115,12 @@ def spike_test(nc, target_vars_in=[], thresh_low=10, thresh_high=20, flag_low=3, print('High spike found') #set corresponding QC value to... - nc.variables[current_var+'_quality_control'][i] = flag_high + nc.variables[current_var+'_quality_control_spk'][i] = flag_high - # # Extract the qc data - # current_qc = np.array(nc.variables[current_var+'_quality_control'][:]) - - # # Find all the instances of consecutive 4s, and reset them to 0 - # for i in np.where(current_qc==4)[0][0:-1][np.diff(np.where(current_qc==4)[0])==1]: - - # nc.variables[current_var+'_quality_control'][i:i+2] = 0 # Find the indices where qc isn't set to 4 (high spike), removing the final element as it can't be check for a spike low_spike_chk_idx = np.where(nc.variables[current_var+'_quality_control'][:]!=4)[0][0:-1] - - #print(low_spike_chk_idx) + # Remove from the indices those that are either side of a high spike for i in np.where(nc.variables[current_var+'_quality_control'][:]==4)[0]: @@ -133,28 +144,15 @@ def spike_test(nc, target_vars_in=[], thresh_low=10, thresh_high=20, flag_low=3, # Calculate the step changes shoulder_diff = np.diff(var_data[i-1:i+2]) - #print('shoulder mean is '+str(shoulder_mean)) - - #abs_diff = abs(var_data[i]-shoulder_mean) - - #print('absolute difference is '+str(abs_diff)) - # Check for spike exceeding low threshold if (abs(var_data[i]-shoulder_mean) > thresh_low) & (True in (shoulder_diff>0)) & (True in (shoulder_diff<0)): #& (1.25*abs(shoulder_diff[0]) >= abs(x[1]) >= 0.75*abs(shoulder_diff[0])): print('Low spike found') #set corresponding QC value to... - nc.variables[current_var+'_quality_control'][i] = flag_low - - # # Extract the qc data - # current_qc = np.array(nc.variables[current_var+'_quality_control'][:]) - - # # Find all the instances of consecutive 3s, and reset them to 0 - # for i in np.where(current_qc==3)[0][0:-1][np.diff(np.where(current_qc==3)[0])==1]: - - # nc.variables[current_var+'_quality_control'][i:i+2] = 0 - + nc.variables[current_var+'_quality_control_spk'][i] = flag_low + + nc.variables[current_var + "_quality_control"][:] = np.maximum(nc.variables[current_var + "_quality_control_spk"][:],nc.variables[current_var + "_quality_control"][:]) # update the history attribute try: diff --git a/ocean_dp/qc/spike_test_ver_2.py b/ocean_dp/qc/spike_test_ver_2.py deleted file mode 100755 index 68fe69e..0000000 --- a/ocean_dp/qc/spike_test_ver_2.py +++ /dev/null @@ -1,149 +0,0 @@ -# Copyright (C) 2020 Ben Weeding -# -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program. If not, see . - -import re -from datetime import datetime, timedelta -from netCDF4 import num2date, date2num -from netCDF4 import stringtochar -import numpy.ma as ma -import sys -from netCDF4 import Dataset -import numpy as np -import argparse -import glob -import pytz -import os - -# If files aren't specified, take all the IMOS*.nc files in the current folder -def spike_test_all_files(target_vars_in=[], thresh_low=2, thresh_high=4, flag_low=3, flag_high=4): - target_files = glob.glob('IMOS*.nc') - - spike_test_files(target_files, target_vars_in=target_vars_in, thresh_low=thresh_low,thresh_high=thresh_high,flag_low=flag_low, flag_high=flag_high) - - -def spike_test_files(target_files, target_vars_in=[], thresh_low=2, thresh_high=4, flag_low=3, flag_high=4): - - # Loop through each files in target_files - for current_file in target_files: - # Print each filename - print("input file %s" % current_file) - - # Extract netcdf data into nc - nc = Dataset(current_file, mode="a") - - # run the spike test - spike_test(nc=nc, target_vars_in=target_vars_in, thresh_low=thresh_low,thresh_high=thresh_high,flag_low=flag_low, flag_high=flag_high) - - - -def spike_test(nc, target_vars_in=[], thresh_low=2, thresh_high=4, flag_low=3, flag_high=4): - - # If target_vars aren't user specified, set it to all the variables of - # the current_file, removing unwanted variables - if target_vars_in == []: - - target_vars = list(nc.variables.keys()) - - # Remove TIME - target_vars.remove('TIME') - - # Remove any quality_control variables - qc_vars = [s for s in target_vars if 'quality_control' in s] - target_vars = [s for s in target_vars if s not in qc_vars] - - # Remove any variables of single length - single_vars = [s for s in target_vars if nc.variables[s].size==1] - target_vars = [s for s in target_vars if s not in single_vars] - - print('target_vars are '+' '.join(target_vars)) - - else: - target_vars = target_vars_in - - # For each variable, extract the data - for current_var in target_vars: - - var_data = np.array(nc.variables[current_var]) - - print('checking '+current_var+' for high spikes') - - # Step through the data, one element at a time, starting from the 2nd element - for i in range(1,(len(var_data)-1)): - - # Calculate the mean of the i-1 and i+1 elements - shoulder_mean = np.mean(np.take(var_data,[i-1,i+1])) - - # Calculate the step changes - shoulder_diff = np.diff(var_data[i-1:i+2]) - - # Check for spike exceeding high threshold - if (abs(var_data[i]-shoulder_mean) > thresh_high) & (True in (shoulder_diff>=0)) & (False in (shoulder_diff>=0)): - - print('High spike found') - - #set corresponding QC value to... - nc.variables[current_var+'_quality_control'][i] = flag_high - - print('checking '+current_var+' for low spikes') - - # For each of the remaining indices - for i in low_spike_chk_idx: - - #print('i is '+str(i)) - - # Calculate the mean of the i-1 and i+1 elements - shoulder_mean = np.mean(np.take(var_data,[i-1,i+1])) - - # Calculate the step changes - shoulder_diff = np.diff(var_data[i-1:i+2]) - - #print('shoulder mean is '+str(shoulder_mean)) - - abs_diff = abs(var_data[i]-shoulder_mean) - - #print('absolute difference is '+str(abs_diff)) - - # Check for spike exceeding low threshold - if (abs(var_data[i]-shoulder_mean) > thresh_low) & (True in (shoulder_diff>=0)) & (False in (shoulder_diff>=0)): - - print('Low spike found') - - #set corresponding QC value to... - nc.variables[current_var+'_quality_control'][i] = flag_low - - # # Extract the qc data - # current_qc = np.array(nc.variables[current_var+'_quality_control'][:]) - - # # Find all the instances of consecutive 3s, and reset them to 0 - # for i in np.where(current_qc==3)[0][0:-1][np.diff(np.where(current_qc==3)[0])==1]: - - # nc.variables[current_var+'_quality_control'][i:i+2] = 0 - - - # update the history attribute - try: - hist = nc.history + "\n" - except AttributeError: - hist = "" - - nc.setncattr('history', hist + datetime.utcnow().strftime("%Y-%m-%d") + ' :spike_test performed on [' + str(target_vars) + '], with spikes greater than '+str(thresh_high)+' flagged as '+str(flag_high)+' and spikes greater than '+str(thresh_low)+' flagged as '+str(flag_low)) - - nc.close() - -if __name__ == "__main__": - # usage is - spike_test_files(target_files=[sys.argv[1]], target_vars_in=[sys.argv[2]], thresh_low=float(sys.argv[3]), thresh_high=float(sys.argv[4]), flag_low= float(sys.argv[5]), flag_high= float(sys.argv[6])) - - \ No newline at end of file diff --git a/ocean_dp/sots_processing_runthrough.py b/ocean_dp/sots_processing_runthrough.py new file mode 100755 index 0000000..491f3a5 --- /dev/null +++ b/ocean_dp/sots_processing_runthrough.py @@ -0,0 +1,74 @@ +# Copyright (C) 2020 Ben Weeding +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +# Initial package import +import sys +import glob +import fnmatch +import os + +# Addition of folder containing user defined packages/modules +sys.path.append('/Users/tru050/Documents/GitHub/imos-tools/ocean_dp/qc') +sys.path.append('/Users/tru050/Documents/GitHub/imos-tools/ocean_dp/aggregation') +sys.path.append('/Users/tru050/Documents/GitHub/imos-tools/ocean_dp/processing') + +# Import user defined packages +import add_qc_flags +import in_out_water +import copyDataset +import pressure_interpolator + +import global_range +import rate_of_change_test + +# Set the working directory +os.chdir('/Users/tru050/Desktop/sofs7.5 test data') + +# Make a list of FV00 filenames +fv00_files = glob.glob('*IMOS_ABOS-SOTS*FV00*.nc') + +# Run add_qc_flags.py and collect FV01 filenames +fv01_files = add_qc_flags.add_qc(fv00_files) + +# Run in_out_water.py +for ncfile in fv01_files: + + in_out_water.in_out_water(ncfile,var_name='TEMP') + +# Select pressure files using matching = fnmatch.filter(sofs75filesfv01,'*SOTS*P*_2*.nc') +pres_files = fnmatch.filter(fv01_files,'*IMOS_ABOS-SOTS*P*_2*FV01*.nc') + +# Run copyDataset.py +copyDataset.aggregate(pres_files,'PRES') + +# Run pressure_interpolator.py +fv01_pres_interp_files = pressure_interpolator.pressure_interpolator(netCDFfiles=fv01_files,agg=glob.glob('*IMOS_ABOS-SOTS*Aggregate*.nc')[0]) + +# Global range test +for ncfile in fv01_pres_interp_files: + + print(ncfile) + + global_range.global_range(ncfile,'TEMP',40,-2) + +# Rate of change +rate_of_change_test.roc_test_files(fv01_pres_interp_files,'TEMP',20) + +# Spike + + +# Flatline + + From 07b40fdd30982cd558448c4adec61e3cc8f424aa Mon Sep 17 00:00:00 2001 From: bweeding <57697604+bweeding@users.noreply.github.com> Date: Wed, 22 Apr 2020 14:03:52 +1000 Subject: [PATCH 36/59] Qc checker and variable selector --- ocean_dp/qc/qc_checker.py | 116 +++++++++++++++++++++++++ ocean_dp/qc/rate_of_change_test.py | 13 +-- ocean_dp/sots_processing_runthrough.py | 17 +++- 3 files changed, 134 insertions(+), 12 deletions(-) create mode 100755 ocean_dp/qc/qc_checker.py diff --git a/ocean_dp/qc/qc_checker.py b/ocean_dp/qc/qc_checker.py new file mode 100755 index 0000000..5d8720d --- /dev/null +++ b/ocean_dp/qc/qc_checker.py @@ -0,0 +1,116 @@ +# Copyright (C) 2020 Ben Weeding +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + + +import re +from datetime import datetime, timedelta +from netCDF4 import num2date, date2num +from netCDF4 import stringtochar +import numpy.ma as ma +import sys +from netCDF4 import Dataset +import numpy as np +import argparse +import glob +import pytz +import os + +# Code is design to check that netcdf files processed using the SOTS methods +# conform to the QC labelling designed by Peter Jansen. + +def qc_checker_all_files(target_vars_in=[]): + + successful_files=[] + + target_files = glob.glob('IMOS*.nc') + + successful_files = qc_checker_files(target_files, target_vars_in=target_vars_in) + + +def qc_checker_files(target_files,target_vars_in=[]): + + successful_files=[] + + # Loop through each files in target_files + for current_file in target_files: + # Print each filename + print("input file %s" % current_file) + + # Extract netcdf data into nc + nc = Dataset(current_file, mode="a") + + # run the spike test - specifying *args here makes python unpack args to be passed again successfully as separate items + if qc_checker(nc,target_vars_in=target_vars_in): + + successful_files.append(current_file) + + return successful_files + + +# Enter args as variable name and rate of change limit, ie. 'TEMP',4 +def qc_checker(nc,target_vars_in=[]): + + all_vars = list(nc.variables.keys()) + + # If target_vars aren't user specified, set it to all the variables of + # the current_file, removing unwanted variables (qc, single length, TIME) + if target_vars_in == []: + + target_vars = [s for s in all_vars if 'TIME' not in s and 'quality_control' not in s and nc.variables[s].size!=1] + + print('target_vars are '+' '.join(target_vars)) + + else: + target_vars = target_vars_in + + qc_behaving = True + + for current_var in target_vars: + + qc_global_data = np.array(nc.variables[current_var+"_quality_control"][:]) + + qc_test_specific = [s for s in all_vars if current_var+"_quality_control" in s and not s.endswith('control')] + + for current_qc_test in qc_test_specific: + + qc_test_data = np.array(nc.variables[current_qc_test]) + + #print('checking '+current_qc_test) + + # If true, fail process + if any(np.less(qc_global_data,qc_test_data)): + + print(current_qc_test + "failed") + + qc_behaving = False + + if qc_behaving: + + return True + + + # Close the current netcdf file + nc.close() + + + + + + + + + + + \ No newline at end of file diff --git a/ocean_dp/qc/rate_of_change_test.py b/ocean_dp/qc/rate_of_change_test.py index d889b1f..a7f54d1 100755 --- a/ocean_dp/qc/rate_of_change_test.py +++ b/ocean_dp/qc/rate_of_change_test.py @@ -79,18 +79,9 @@ def roc_test(nc,*args,target_vars_in=[]): # the current_file, removing unwanted variables if target_vars_in == []: - target_vars = list(nc.variables.keys()) + all_vars = list(nc.variables.keys()) - # Remove TIME - target_vars.remove('TIME') - - # Remove any quality_control variables - qc_vars = [s for s in target_vars if 'quality_control' in s] - target_vars = [s for s in target_vars if s not in qc_vars] - - # Remove any variables of single length - single_vars = [s for s in target_vars if nc.variables[s].size==1] - target_vars = [s for s in target_vars if s not in single_vars] + target_vars = [s for s in all_vars if 'TIME' not in s and 'quality_control' not in s and nc.variables[s].size!=1] print('target_vars are '+' '.join(target_vars)) diff --git a/ocean_dp/sots_processing_runthrough.py b/ocean_dp/sots_processing_runthrough.py index 491f3a5..474eb3a 100755 --- a/ocean_dp/sots_processing_runthrough.py +++ b/ocean_dp/sots_processing_runthrough.py @@ -18,6 +18,7 @@ import glob import fnmatch import os +import time # Addition of folder containing user defined packages/modules sys.path.append('/Users/tru050/Documents/GitHub/imos-tools/ocean_dp/qc') @@ -32,6 +33,11 @@ import global_range import rate_of_change_test +import spike_test +import flatline_test +import qc_checker + +start = time.time() # Set the working directory os.chdir('/Users/tru050/Desktop/sofs7.5 test data') @@ -67,8 +73,17 @@ rate_of_change_test.roc_test_files(fv01_pres_interp_files,'TEMP',20) # Spike - +spike_test.spike_test_files(fv01_pres_interp_files,target_vars_in=['TEMP']) # Flatline +flatline_test.flatline_test_files(fv01_pres_interp_files,['TEMP']) + +# Check qc process has worked +fv01_qc_checked = qc_checker.qc_checker_files(fv01_pres_interp_files,['TEMP']) + +end = time.time() + +print('time elapsed: '+end-start) + From a6bb5603206ca99784be304a4e20d676dd64e7fc Mon Sep 17 00:00:00 2001 From: bweeding <57697604+bweeding@users.noreply.github.com> Date: Wed, 22 Apr 2020 14:23:55 +1000 Subject: [PATCH 37/59] commented qc checker --- ocean_dp/qc/qc_checker.py | 28 +++++++++++++++++--------- ocean_dp/sots_processing_runthrough.py | 2 +- 2 files changed, 20 insertions(+), 10 deletions(-) diff --git a/ocean_dp/qc/qc_checker.py b/ocean_dp/qc/qc_checker.py index 5d8720d..e229d72 100755 --- a/ocean_dp/qc/qc_checker.py +++ b/ocean_dp/qc/qc_checker.py @@ -36,6 +36,7 @@ def qc_checker_all_files(target_vars_in=[]): target_files = glob.glob('IMOS*.nc') + # Returns the files that conform to the qc labelling successful_files = qc_checker_files(target_files, target_vars_in=target_vars_in) @@ -51,17 +52,19 @@ def qc_checker_files(target_files,target_vars_in=[]): # Extract netcdf data into nc nc = Dataset(current_file, mode="a") - # run the spike test - specifying *args here makes python unpack args to be passed again successfully as separate items + # If the qc_checker was successfull, add the filename to the list if qc_checker(nc,target_vars_in=target_vars_in): successful_files.append(current_file) - + + # Returns the files that conform to the qc labelling return successful_files # Enter args as variable name and rate of change limit, ie. 'TEMP',4 def qc_checker(nc,target_vars_in=[]): + # Collect all the variables from the netcdf all_vars = list(nc.variables.keys()) # If target_vars aren't user specified, set it to all the variables of @@ -74,31 +77,38 @@ def qc_checker(nc,target_vars_in=[]): else: target_vars = target_vars_in - - qc_behaving = True - + + # For each of the variables selected for current_var in target_vars: + # Collect the global qc data qc_global_data = np.array(nc.variables[current_var+"_quality_control"][:]) + # Collect the names of all the other test specific qc vectors qc_test_specific = [s for s in all_vars if current_var+"_quality_control" in s and not s.endswith('control')] + # For each of the other test specific qc vectors for current_qc_test in qc_test_specific: + # Extract the data qc_test_data = np.array(nc.variables[current_qc_test]) #print('checking '+current_qc_test) - # If true, fail process + # If any of the test specific qc vectors ever have a great value than the global qc vector at a timestamp, the qc process has failed if any(np.less(qc_global_data,qc_test_data)): print(current_qc_test + "failed") qc_behaving = False - if qc_behaving: - - return True + else: + + # The qc process has succeeded + qc_behaving = True + + # Returns true if qc has succeeded, false if not + return qc_behaving # Close the current netcdf file diff --git a/ocean_dp/sots_processing_runthrough.py b/ocean_dp/sots_processing_runthrough.py index 474eb3a..6c8bd90 100755 --- a/ocean_dp/sots_processing_runthrough.py +++ b/ocean_dp/sots_processing_runthrough.py @@ -83,7 +83,7 @@ end = time.time() -print('time elapsed: '+end-start) +print('time elapsed: '+str(end-start)) From f97718bcd8264cf0450c8bd19d8cfe85c5b7af74 Mon Sep 17 00:00:00 2001 From: bweeding <57697604+bweeding@users.noreply.github.com> Date: Mon, 27 Apr 2020 17:00:20 +1000 Subject: [PATCH 38/59] Changed tests to update netcdfs correctly when reprocessing data --- ocean_dp/plotting/plotNetCDFmultiqc.py | 340 +++++++++++++++++++++++++ ocean_dp/qc/flatline_test.py | 145 ++++++----- ocean_dp/qc/global_range.py | 3 +- ocean_dp/qc/qc_checker.py | 6 +- ocean_dp/qc/rate_of_change_test.py | 57 ++++- ocean_dp/qc/spike_test.py | 8 +- ocean_dp/sots_processing_runthrough.py | 6 +- 7 files changed, 479 insertions(+), 86 deletions(-) create mode 100755 ocean_dp/plotting/plotNetCDFmultiqc.py diff --git a/ocean_dp/plotting/plotNetCDFmultiqc.py b/ocean_dp/plotting/plotNetCDFmultiqc.py new file mode 100755 index 0000000..9442a56 --- /dev/null +++ b/ocean_dp/plotting/plotNetCDFmultiqc.py @@ -0,0 +1,340 @@ +#!/usr/bin/python3 + +# raw2netCDF +# Copyright (C) 2019 Peter Jansen +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +from netCDF4 import Dataset +from netCDF4 import num2date +import datetime as dt +import numpy as np +import matplotlib + +matplotlib.use('Agg') + +import matplotlib.pyplot as plt +from matplotlib.backends.backend_pdf import PdfPages +import sys +import os +from matplotlib import rc + +# rc('text', usetex=True) + +for path_file in sys.argv[1:len(sys.argv)]: + + nc = Dataset(path_file) + + # get time variable + nctime = nc.variables['TIME'][:] + t_unit = nc.variables['TIME'].units # get unit "days since 1950-01-01T00:00:00Z" + + try: + t_cal = nc.variables['TIME'].calendar + except AttributeError: # Attribute doesn't exist + t_cal = u"gregorian" # or standard + + dt_time = [num2date(t, units=t_unit, calendar=t_cal) for t in nctime] + + # work out variables to plot + nc_vars_to_plot = [var for var in nc.variables] + + # remove any dimensions from the list to plot + nc_dims = [dim for dim in nc.dimensions] # list of nc dimensions + + for i in nc_dims: + try: + nc_vars_to_plot.remove(i) + except ValueError: + print('did not remove ', i) + + # remove an auxiliary variables from the list to plot + aux_vars = list() + for var in nc.variables: + try: + aux_vars.append(nc.variables[var].getncattr('ancillary_variables')) + except AttributeError: + pass + + # remove any variables without a TIME dimension from the list to plot + to_plot = list() + + for var in nc.variables: + # print var + if var in nc_dims: + continue + if var in aux_vars: + continue + if 'TIME' in nc.variables[var].dimensions: + print('to plot ', var) + to_plot.append(var) + + # pdffile = path_file[path_file.rfind('/')+1:len(path_file)] + '-' + nc.getncattr('deployment_code') + '-plot.pdf' + + pdffile = path_file + '.pdf' + + pp = PdfPages(pdffile) + + txt = "" + lines = 0 + plt.figure(figsize=(11.69, 8.27)) + + txt = 'file name : ' + os.path.basename(path_file) + '\n\n' + + txt += 'Dimensions:\n' + for x in nc.dimensions: + txt += ' ' + x + ' (' + str(nc.dimensions[x].size) + ')\n' + + txt += '\nVariables:\n' + for x in nc.variables: + v_atts = nc.variables[x] + var_line = ' ' + x + ' ' + str(v_atts.dimensions) + + try: + var_line += ' : long_name = ' + v_atts.long_name + except AttributeError: + pass + try: + var_line += ' (' + v_atts.units + ')' + except AttributeError: + pass + var_line += ' : type ' + str(v_atts.datatype) + + print(var_line) + + lines = txt.count('\n') + var_line.count('\n') + # print("lines ", lines) + if lines > 57: + #print(txt) + print('new page') + plt.text(-0.1, -0.1, txt, fontsize=8, family='monospace') + plt.axis('off') + pp.savefig() + plt.close() + plt.figure(figsize=(11.69, 8.27)) + + txt = "" + + lines = 0 + + txt += var_line + '\n' + + plt.figure(figsize=(11.69, 8.27)) + + plt.text(-0.1, -0.1, txt, fontsize=8, family='monospace') + plt.axis('off') + pp.savefig() + plt.close() + + txt = "" + plt.figure(figsize=(11.69, 8.27)) + + lines = 0 + # print "NetCDF Global Attributes:" + for nc_attr in sorted(nc.ncattrs(), key=lambda s: s.lower()): + #print('\t%s:' % nc_attr, repr(nc.getncattr(nc_attr))) + attrib_txt = nc_attr + ' : ' + str(nc.getncattr(nc_attr)).replace('\n', '\n ') + '\n' + lines = txt.count('\n') + attrib_txt.count('\n') + # print("lines ", lines) + if lines > 57: + #print(txt) + print('new page') + plt.text(-0.1, -0.1, txt, fontsize=8, family='monospace') + plt.axis('off') + pp.savefig() + plt.close() + plt.figure(figsize=(11.69, 8.27)) + + txt = "" + + lines = 0 + + txt += attrib_txt + + lines += 1 + + #print(txt) + plt.text(-0.1, -0.1, txt, fontsize=8, family='monospace') + plt.axis('off') + pp.savefig() + plt.close() + + # plot each variable in the to_plot list + for plot in to_plot: + + plot_var = nc.variables[plot] + + var = plot_var[:] + shape_len = len(var.shape) + + # create a page with information about the variable, and any aux variables + fig = plt.figure(figsize=(11.69, 8.27)) + + text = "Variable : " + plot_var.name + str(plot_var.dimensions) + "\n" + nc_attrs = plot_var.ncattrs() + # print "NetCDF Variable Attributes:" + for nc_attr in nc_attrs: + attrVal = plot_var.getncattr(nc_attr) + #print('\t%s:' % nc_attr, repr(plot_var.getncattr(nc_attr)), type(attrVal)) + text += nc_attr + ' : ' + str(attrVal) + '\n' + + if hasattr(plot_var, 'ancillary_variables'): + qc_var_name = plot_var.getncattr('ancillary_variables') + qc_var = nc.variables[qc_var_name] + + text += "\nAUX : " + qc_var.name + str(qc_var.dimensions) + "\n" + + nc_attrs = qc_var.ncattrs() + # print "NetCDF AUX Variable Attributes:" + for nc_attr in nc_attrs: + # print '\t%s:' % nc_attr, repr(nc.getncattr(nc_attr)) + text += nc_attr + ' : ' + str(qc_var.getncattr(nc_attr)) + '\n' + + qc = nc.variables[qc_var_name][:] + + if plot_var.dimensions[0] != 'TIME': + qc = np.transpose(qc) + + qc = np.squeeze(qc) + else: + qc = 0 + + plt.text(-0.1, 0.0, text, fontsize=8, family='monospace') + plt.axis('off') + pp.savefig(fig) + plt.close(fig) + + print(plot_var.name, " shape ", var.shape, " len ", shape_len) + + # now create a page with the plot + + fig = plt.figure(figsize=(11.69, 8.27)) + ax = plt.subplot(111) + + if plot_var.dimensions[0] != 'TIME': + var = np.transpose(var) + var = np.squeeze(var) + + # create range from only good data + qc_m = np.ma.masked_where((qc == 9) | (qc == 4) | (qc == 6), var) + mx = qc_m.max() + mi = qc_m.min() + + marg = (mx - mi) * 0.1 + print("max ", mx, " min ", mi) + + plt.ylim([mi - marg, mx + marg]) + + # create a legend entry made from serial_number and depth + if hasattr(plot_var, 'sensor_serial_number'): + sn = plot_var.getncattr('sensor_serial_number').split('; ') + elif hasattr(plot_var, 'sensor_serial_number'): + sn = nc.getncattr('instrument_serial_number').split('; ') + else: + sn = 'not found' + + if hasattr(plot_var, 'sensor_depth'): + dpth = plot_var.getncattr('sensor_depth').split('; ') + elif hasattr(plot_var, 'sensor_height'): + dpth = plot_var.getncattr('sensor_height').split('; ') + elif hasattr(nc, 'instrument_nominal_depth'): + dpth = str(nc.getncattr('instrument_nominal_depth')).split('; ') + else: + dpth = 'unknown' + + print("depth ", dpth) + + leg = [x + ' (' + y + ' m)' for x, y in zip(sn, dpth)] + + # if less than 200 points plot with a dot and line + plot_marks = '-' + if len(dt_time) < 200: + plot_marks = '.-' + + pl = ax.plot(dt_time, qc_m, plot_marks) + + # mark qc>2 with yellow dot, qc>3 with red dot + qc_m = np.ma.masked_where((qc <= 2) | (qc == 8), var) + ax.plot(dt_time, qc_m, 'yo') + qc_m = np.ma.masked_where((qc <= 3) | (qc == 8), var) + ax.plot(dt_time, qc_m, 'ro') + + # shrink the plot some + box = ax.get_position() + ax.set_position([box.x0, box.y0 + box.height * 0.1, box.width, box.height * 0.9]) + + # add legend below plot + #plt.legend(iter(pl), leg, loc='lower center', bbox_to_anchor=(0.5, -0.05), ncol=5) + + plt.legend(iter(pl), leg, bbox_to_anchor=(0.0, -0.2, 1.0, -0.15), loc=3, ncol=6, mode="expand", borderaxespad=0.0, fontsize='x-small') + + # invert the yaxis if the units are dbar + try: + if plot_var.units == 'dbar': + plt.gca().invert_yaxis() + except AttributeError: + pass + try: + if plot_var.positive == 'down': + plt.gca().invert_yaxis() + except AttributeError: + pass + + #fig.autofmt_xdate() + plt.grid() + + # add deployment/instrument/standard name as title + + # plt.title(nc.getncattr('deployment_code') + ' : ' + plot_var.sensor_name + ' ' + \ + # plot_var.sensor_serial_number + ' : ' + plot_var.name, fontsize=10) + + # plt.title(nc.getncattr('deployment_code') + ' : ' + plot_var.getncattr('name'), fontsize=10) + try: + plt.title(nc.getncattr('deployment_code'), fontsize=10) + except AttributeError: + pass + + # add units to Y axis + try: + plt.ylabel(plot + ' (' + plot_var.units + ')') + except AttributeError: + pass + + date_time_start = None + date_time_end = None + + # plot only the time of deployment + try: + date_time_start = dt.datetime.strptime(nc.getncattr('time_coverage_start'), '%Y-%m-%dT%H:%M:%SZ') + date_time_end = dt.datetime.strptime(nc.getncattr('time_coverage_end'), '%Y-%m-%dT%H:%M:%SZ') + except AttributeError: + pass + try: + date_time_start = dt.datetime.strptime(nc.getncattr('time_deployment_start'), '%Y-%m-%dT%H:%M:%SZ') + date_time_end = dt.datetime.strptime(nc.getncattr('time_deployment_end'), '%Y-%m-%dT%H:%M:%SZ') + except AttributeError: + pass + + if date_time_start: + plt.xlim(date_time_start, date_time_end) + + # plt.savefig(plot + '.pdf') + pp.savefig(fig, papertype='a4') + plt.close(fig) + + # plt.show() + + pp.close() + + nc.close() diff --git a/ocean_dp/qc/flatline_test.py b/ocean_dp/qc/flatline_test.py index a9e7d23..79ab981 100755 --- a/ocean_dp/qc/flatline_test.py +++ b/ocean_dp/qc/flatline_test.py @@ -28,13 +28,13 @@ # If files aren't specified, take all the IMOS*.nc files in the current folder -def flatline_test_all_files(target_vars_in=[], window=3, flag=4): +def flatline_test_all_files(target_vars_in=[], window=5, flag=4): target_files = glob.glob('IMOS*.nc') flatline_test_files(target_files, target_vars_in=target_vars_in, window=window, flag=flag) -def flatline_test_files(target_files, target_vars_in=[], window=3, flag=4): +def flatline_test_files(target_files, target_vars_in=[], window=5, flag=4): # Loop through each files in target_files for current_file in target_files: @@ -48,74 +48,87 @@ def flatline_test_files(target_files, target_vars_in=[], window=3, flag=4): flatline_test(nc=nc, target_vars_in=target_vars_in, window=window, flag=flag) -def flatline_test(nc, target_vars_in=[], window=3, flag=4): +def flatline_test(nc, target_vars_in=[], window=5, flag=4): - # If target_vars aren't user specified, set it to all the variables of - # the current_file, removing unwanted variables - if target_vars_in == []: - - target_vars = list(nc.variables.keys()) - - # Remove TIME - target_vars.remove('TIME') - - # Remove any quality_control variables - qc_vars = [s for s in target_vars if 'quality_control' in s] - target_vars = [s for s in target_vars if s not in qc_vars] - - # Remove any variables of single length - single_vars = [s for s in target_vars if nc.variables[s].size==1] - target_vars = [s for s in target_vars if s not in single_vars] - - print('target_vars are '+' '.join(target_vars)) - - else: - target_vars = target_vars_in - - # For each variable, extract the data - for current_var in target_vars: - - # Extract the variable - nc_var = nc.variables[current_var] - - if nc_var.name + "_quality_control_flt" in nc.variables: - ncVarOut = nc.variables[nc_var.name + "_quality_control_flt"] - else: - ncVarOut = nc.createVariable(nc_var.name + "_quality_control_flt", "i1", nc_var.dimensions, fill_value=99, zlib=True) # fill_value=0 otherwise defaults to max - ncVarOut[:] = np.zeros(nc_var.shape) - ncVarOut.long_name = "quality flag for " + nc_var.name - ncVarOut.flag_values = np.array([0, 1, 2, 3, 4, 6, 7, 9], dtype=np.int8) - ncVarOut.flag_meanings = 'unknown good_data probably_good_data probably_bad_data bad_data not_deployed interpolated missing_value' + print('Window is '+str(window)) + + # If target_vars aren't user specified, set it to all the variables of + # the current_file, removing unwanted variables + if target_vars_in == []: + + target_vars = list(nc.variables.keys()) + + # Remove TIME + target_vars.remove('TIME') + + # Remove any quality_control variables + qc_vars = [s for s in target_vars if 'quality_control' in s] + target_vars = [s for s in target_vars if s not in qc_vars] + + # Remove any variables of single length + single_vars = [s for s in target_vars if nc.variables[s].size==1] + target_vars = [s for s in target_vars if s not in single_vars] + + print('target_vars are '+' '.join(target_vars)) + else: + target_vars = target_vars_in + + # For each variable, extract the data + for current_var in target_vars: + + # Extract the variable + nc_var = nc.variables[current_var] + + if nc_var.name + "_quality_control_flt" in nc.variables: + print('flt qc variable already present') + ncVarOut = nc.variables[nc_var.name + "_quality_control_flt"] + ncVarOut[:] = 0 + else: + ncVarOut = nc.createVariable(nc_var.name + "_quality_control_flt", "i1", nc_var.dimensions, fill_value=99, zlib=True) # fill_value=0 otherwise defaults to max + ncVarOut[:] = 0 + # print(all(nc.variables[nc_var.name + "_quality_control_flt"]==0)) + ncVarOut.long_name = "quality flag for " + nc_var.name + ncVarOut.flag_values = np.array([0, 1, 2, 3, 4, 6, 7, 9], dtype=np.int8) + ncVarOut.flag_meanings = 'unknown good_data probably_good_data probably_bad_data bad_data not_deployed interpolated missing_value' + # add new variable to list of aux variables nc_var.ancillary_variables = nc_var.ancillary_variables + " " + nc_var.name + "_quality_control_flt" - - var_data = np.array(nc.variables[current_var]) - - print('checking '+current_var) - - # Step through the data, one element at a time, using the window - for i in range(0,(len(var_data)-window+1)): - - # This is true if 'window' elements in a row are equal - if len(set(var_data[i:(i+window)])) == 1: - - # set corresponding QC value to... - nc.variables[current_var+'_quality_control_flt'][i:(i+window)] = flag - - nc.variables[current_var + "_quality_control"][:] = np.maximum(nc.variables[current_var + "_quality_control_flt"][:],nc.variables[current_var + "_quality_control"][:]) - - # update the history attribute - try: - hist = nc.history + "\n" - except AttributeError: - hist = "" - - - - nc.setncattr('history', hist + datetime.utcnow().strftime("%Y-%m-%d") + 'flatline_test performed on [' + str(target_vars) + '], window '+str(window)+' consecutive values or more were flagged with '+str(flag) ) - - nc.close() + + var_data = np.array(nc.variables[current_var][:]) + + if (all(nc.variables[nc_var.name + "_quality_control_flt"][:] == 0)): + print('All test specific qc values are zero before filling') + + print('checking ' + current_var) + + print('Window is ' + str(window)) + + # Step through the data, one element at a time, using the window + for i in range(0, (len(var_data) - window + 1)): + + # This is true if 'window' elements in a row are equal + if len(set(var_data[i:(i + window)])) == 1: + print(str(i)) + # set corresponding QC value to... + ncVarOut[i:(i + window)] = flag + + points_marked = len([elem for elem in ncVarOut[:] if elem == 4]) + print('Data points flagged: ', points_marked) + + qc_var = nc.variables[current_var + "_quality_control"] + qc_var[:] = np.maximum(ncVarOut[:], qc_var[:]) + # update the history attribute + try: + hist = nc.history + "\n" + except AttributeError: + hist = "" + + + + nc.setncattr('history', hist + datetime.utcnow().strftime("%Y-%m-%d") + 'flatline_test performed on [' + str(target_vars) + '], window '+str(window)+' consecutive values or more were flagged with '+str(flag) ) + + nc.close() if __name__ == "__main__": # usage is diff --git a/ocean_dp/qc/global_range.py b/ocean_dp/qc/global_range.py index 68e2ee0..db3b80b 100644 --- a/ocean_dp/qc/global_range.py +++ b/ocean_dp/qc/global_range.py @@ -57,9 +57,10 @@ def global_range(netCDFfile, variable, max, min, qc_value=4): # create a qc variable just for this test flags if nc_var.name + "_quality_control_gr" in ds.variables: ncVarOut = ds.variables[nc_var.name + "_quality_control_gr"] + ncVarOut[:] = 0 else: ncVarOut = ds.createVariable(nc_var.name + "_quality_control_gr", "i1", nc_var.dimensions, fill_value=99, zlib=True) # fill_value=0 otherwise defaults to max - ncVarOut[:] = np.zeros(nc_var.shape) + ncVarOut[:] = 0 ncVarOut.long_name = "quality flag for " + nc_var.name ncVarOut.flag_values = np.array([0, 1, 2, 3, 4, 6, 7, 9], dtype=np.int8) ncVarOut.flag_meanings = 'unknown good_data probably_good_data probably_bad_data bad_data not_deployed interpolated missing_value' diff --git a/ocean_dp/qc/qc_checker.py b/ocean_dp/qc/qc_checker.py index e229d72..90e83e8 100755 --- a/ocean_dp/qc/qc_checker.py +++ b/ocean_dp/qc/qc_checker.py @@ -115,7 +115,11 @@ def qc_checker(nc,target_vars_in=[]): nc.close() - +# def qc_check_plot(target_file,target_var) + +# nc = Dataset(target_file,'r') + + diff --git a/ocean_dp/qc/rate_of_change_test.py b/ocean_dp/qc/rate_of_change_test.py index a7f54d1..84f8e4f 100755 --- a/ocean_dp/qc/rate_of_change_test.py +++ b/ocean_dp/qc/rate_of_change_test.py @@ -102,26 +102,38 @@ def roc_test(nc,*args,target_vars_in=[]): if nc_var.name + "_quality_control_roc" in nc.variables: ncVarOut = nc.variables[nc_var.name + "_quality_control_roc"] + ncVarOut[:] = np.zeros(nc.variables[nc_var.name + "_quality_control_roc"].shape) else: ncVarOut = nc.createVariable(nc_var.name + "_quality_control_roc", "i1", nc_var.dimensions, fill_value=99, zlib=True) # fill_value=0 otherwise defaults to max - ncVarOut[:] = np.zeros(nc_var.shape) + ncVarOut[:] = 0 ncVarOut.long_name = "quality flag for " + nc_var.name ncVarOut.flag_values = np.array([0, 1, 2, 3, 4, 6, 7, 9], dtype=np.int8) ncVarOut.flag_meanings = 'unknown good_data probably_good_data probably_bad_data bad_data not_deployed interpolated missing_value' - # add new variable to list of aux variables - nc_var.ancillary_variables = nc_var.ancillary_variables + " " + nc_var.name + "_quality_control_roc" + # add new variable to list of aux variables + nc_var.ancillary_variables = nc_var.ancillary_variables + " " + nc_var.name + "_quality_control_roc" # Extract the variable data var_data = np.array(nc.variables[current_var][:]) + print('Not equal to zero test type 1') + print(str(np.where(ncVarOut[:]!=0))) + + if (all(ncVarOut[:] == 0)): + print('All test specific qc values are zero before filling') + # Calculate dvar/dtime var_roc = np.divide(np.diff(var_data),np.diff(nc_time_hr)) # For any change greater than change_per_hr, assign a qc value of 4 - nc.variables[current_var+'_quality_control_roc'][[x for x in abs(np.insert(var_roc,0,0)) > change_per_hr]] = 4 + ncVarOut[[x for x in abs(np.insert(var_roc,0,0)) > change_per_hr]] = 4 + + # Extract global qc + qc_var = nc.variables[current_var + "_quality_control"] + + # Overwrite global qc with any higher values from test specific qc + qc_var[:] = np.maximum(ncVarOut[:], qc_var[:]) - nc.variables[current_var + "_quality_control"][:] = np.maximum(nc.variables[current_var + "_quality_control_roc"][:],nc.variables[current_var + "_quality_control"][:]) print(current_var + ' tested: '+str(sum([x for x in abs(np.insert(var_roc,0,0)) > change_per_hr])) + ' changes found above '+str(change_per_hr)+' '+nc.variables[current_var].units+' per hour') @@ -159,27 +171,50 @@ def roc_test(nc,*args,target_vars_in=[]): if nc_var.name + "_quality_control_roc" in nc.variables: ncVarOut = nc.variables[nc_var.name + "_quality_control_roc"] + ncVarOut[:] = np.zeros(nc.variables[nc_var.name + "_quality_control_roc"].shape) else: ncVarOut = nc.createVariable(nc_var.name + "_quality_control_roc", "i1", nc_var.dimensions, fill_value=99, zlib=True) # fill_value=0 otherwise defaults to max - ncVarOut[:] = np.zeros(nc_var.shape) + ncVarOut[:] = 0 ncVarOut.long_name = "quality flag for " + nc_var.name ncVarOut.flag_values = np.array([0, 1, 2, 3, 4, 6, 7, 9], dtype=np.int8) ncVarOut.flag_meanings = 'unknown good_data probably_good_data probably_bad_data bad_data not_deployed interpolated missing_value' - # add new variable to list of aux variables - nc_var.ancillary_variables = nc_var.ancillary_variables + " " + nc_var.name + "_quality_control_roc" - + # add new variable to list of aux variables + nc_var.ancillary_variables = nc_var.ancillary_variables + " " + nc_var.name + "_quality_control_roc" + # Extract the data var_data = np.array(nc.variables[current_var]) + print('Not equal to zero test type 2') + print(str(np.where(ncVarOut[:]!=0))) + + if (all(ncVarOut[:] == 0)): + print('All test specific qc values are zero before filling') + # Calculate dvar/dtime var_roc = np.divide(np.diff(var_data),np.diff(nc_time_hr)) # For any change greater than change_per_hr, assign a qc value of 4 - nc.variables[current_var+'_quality_control_roc'][[x for x in abs(np.insert(var_roc,0,0)) > rate_spec[current_var]]] = 4 + ncVarOut[[x for x in abs(np.insert(var_roc,0,0)) > rate_spec[current_var]]] = 4 + + print('ncVarOut 4s') + print(str(np.where(ncVarOut[:]==4))) + + print('Netcdf variable 4s') + print(str(np.where(nc.variables[nc_var.name + "_quality_control_roc"][:]==4))) + + nc.variables[nc_var.name + "_quality_control_roc"][:] = ncVarOut[:] + + print('Netcdf variable 4s after assignment') + print(str(np.where(nc.variables[nc_var.name + "_quality_control_roc"][:]==4))) + + + # Extract global qc + qc_var = nc.variables[current_var + "_quality_control"] - nc.variables[current_var + "_quality_control"][:] = np.maximum(nc.variables[current_var + "_quality_control_roc"][:],nc.variables[current_var + "_quality_control"][:]) + # Overwrite global qc with any higher values from test specific qc + qc_var[:] = np.maximum(ncVarOut[:], qc_var[:]) print(current_var + ' tested: '+str(sum([x for x in abs(np.insert(var_roc,0,0)) > rate_spec[current_var]])) + ' changes found above '+str(rate_spec[current_var])+' '+nc.variables[current_var].units+' per hour') diff --git a/ocean_dp/qc/spike_test.py b/ocean_dp/qc/spike_test.py index 622d3a3..089cf63 100755 --- a/ocean_dp/qc/spike_test.py +++ b/ocean_dp/qc/spike_test.py @@ -115,15 +115,15 @@ def spike_test(nc, target_vars_in=[], thresh_low=default_low, thresh_high=defaul print('High spike found') #set corresponding QC value to... - nc.variables[current_var+'_quality_control_spk'][i] = flag_high + ncVarOut[i] = flag_high # Find the indices where qc isn't set to 4 (high spike), removing the final element as it can't be check for a spike - low_spike_chk_idx = np.where(nc.variables[current_var+'_quality_control'][:]!=4)[0][0:-1] + low_spike_chk_idx = np.where(ncVarOut[:]!=4)[0][0:-1] # Remove from the indices those that are either side of a high spike - for i in np.where(nc.variables[current_var+'_quality_control'][:]==4)[0]: + for i in ncVarOut[:]==4: low_spike_chk_idx=low_spike_chk_idx[low_spike_chk_idx!=[i-1]] @@ -150,7 +150,7 @@ def spike_test(nc, target_vars_in=[], thresh_low=default_low, thresh_high=defaul print('Low spike found') #set corresponding QC value to... - nc.variables[current_var+'_quality_control_spk'][i] = flag_low + ncVarOut[i] = flag_low nc.variables[current_var + "_quality_control"][:] = np.maximum(nc.variables[current_var + "_quality_control_spk"][:],nc.variables[current_var + "_quality_control"][:]) diff --git a/ocean_dp/sots_processing_runthrough.py b/ocean_dp/sots_processing_runthrough.py index 6c8bd90..75fe6f4 100755 --- a/ocean_dp/sots_processing_runthrough.py +++ b/ocean_dp/sots_processing_runthrough.py @@ -40,7 +40,7 @@ start = time.time() # Set the working directory -os.chdir('/Users/tru050/Desktop/sofs7.5 test data') +os.chdir('/Users/tru050/Desktop/sofs6 test data') # Make a list of FV00 filenames fv00_files = glob.glob('*IMOS_ABOS-SOTS*FV00*.nc') @@ -70,13 +70,13 @@ global_range.global_range(ncfile,'TEMP',40,-2) # Rate of change -rate_of_change_test.roc_test_files(fv01_pres_interp_files,'TEMP',20) +rate_of_change_test.roc_test_files(fv01_pres_interp_files,'TEMP',10) # Spike spike_test.spike_test_files(fv01_pres_interp_files,target_vars_in=['TEMP']) # Flatline -flatline_test.flatline_test_files(fv01_pres_interp_files,['TEMP']) +flatline_test.flatline_test_files(fv01_pres_interp_files,['TEMP'],window=10) # Check qc process has worked fv01_qc_checked = qc_checker.qc_checker_files(fv01_pres_interp_files,['TEMP']) From 43dd011c0a4b308aecc79c344b47a96c75414153 Mon Sep 17 00:00:00 2001 From: bweeding <57697604+bweeding@users.noreply.github.com> Date: Tue, 19 May 2020 17:17:28 +1000 Subject: [PATCH 39/59] updating petes plotQC, building temp_stat_plot --- ocean_dp/plotting/plotQC.py | 88 ++++++++++++++++++++ ocean_dp/qc/temp_diff_hist_extra.py | 16 ++++ ocean_dp/qc/temp_diff_histograms.py | 4 +- ocean_dp/qc/temp_stat_plot.py | 107 +++++++++++++++++++++++++ ocean_dp/sots_processing_runthrough.py | 2 +- 5 files changed, 214 insertions(+), 3 deletions(-) create mode 100755 ocean_dp/plotting/plotQC.py create mode 100755 ocean_dp/qc/temp_stat_plot.py diff --git a/ocean_dp/plotting/plotQC.py b/ocean_dp/plotting/plotQC.py new file mode 100755 index 0000000..3060759 --- /dev/null +++ b/ocean_dp/plotting/plotQC.py @@ -0,0 +1,88 @@ +#!/usr/bin/python3 + +# raw2netCDF +# Copyright (C) 2019 Peter Jansen +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +import xarray as xr + +import matplotlib.pyplot as plt +from matplotlib.backends.backend_pdf import PdfPages +import seaborn as sns + +from pandas.plotting import register_matplotlib_converters + +import sys +import os + + +def plot_all(files): + register_matplotlib_converters() + plt.style.use('seaborn-darkgrid') + sns.set_context("paper") + + pp = PdfPages(os.path.join(os.path.dirname(files[0]), "batch-qc.pdf")) + + for f in files: + print("file ", f) + do_plot(f) + + pp.savefig() + plt.close() + + pp.close() + + +def plot(fn): + register_matplotlib_converters() + plt.style.use('seaborn-darkgrid') + sns.set_context("paper") + + pp = PdfPages(fn + "-qc.pdf") + + do_plot(fn) + + pp.savefig() + pp.close() + plt.close() + + +def do_plot(fn): + + #fn = sys.argv[1] + #fn = '/Users/pete/cloudstor/SOTS-Temp-Raw-Data/SOFS-7.5-2018/netCDF/IMOS_ABOS-SOTS_TIP_20180801_SOFS_FV01_SOFS-7.5-2018-Starmon-mini-4047-40m_END-20190331_C-20200429.nc' + + DS = xr.open_dataset(fn) + + ax1 = plt.subplot(2, 1, 1) + plt.plot(DS.TIME, DS.PAR) + plt.title(DS.deployment_code + " - " + DS.instrument_model + ":" + DS.instrument_serial_number + " @ " + str(DS.instrument_nominal_depth), {'fontsize': 8}) + + ax2 = plt.subplot(2, 1, 2, sharex=ax1) + aux = DS.PAR.ancillary_variables + a_vars = aux.split(" ") + for f in sorted(set(a_vars)): + print('aux var', f) + varn = f.split("_") + plt.plot(DS.TIME, DS.variables[f], label=varn[-1]) + plt.ylim(0, 9) + + plt.legend(prop={'size': 6}) + + DS.close() + + +if __name__ == "__main__": + plot_all(sys.argv[1:]) \ No newline at end of file diff --git a/ocean_dp/qc/temp_diff_hist_extra.py b/ocean_dp/qc/temp_diff_hist_extra.py index d8b6019..14dc6cf 100755 --- a/ocean_dp/qc/temp_diff_hist_extra.py +++ b/ocean_dp/qc/temp_diff_hist_extra.py @@ -13,6 +13,22 @@ # You should have received a copy of the GNU General Public License # along with this program. If not, see . +import numpy.ma as ma +import sys +from netCDF4 import Dataset +import numpy as np +import argparse +import glob +import pytz +import os +import matplotlib.pyplot as plt +from matplotlib import colors +from matplotlib.ticker import PercentFormatter +import glob +from netCDF4 import num2date +from dateutil import parser +import datetime + for root, dirs, files in os.walk("/Users/tru050/Desktop/cloudstor/Shared/SOTS-Temp-Raw-Data"): for fname in files: diff --git a/ocean_dp/qc/temp_diff_histograms.py b/ocean_dp/qc/temp_diff_histograms.py index 2003afc..a3f8420 100755 --- a/ocean_dp/qc/temp_diff_histograms.py +++ b/ocean_dp/qc/temp_diff_histograms.py @@ -130,7 +130,7 @@ def last_four(entry): deployments.sort(key=last_four) - +####### @@ -207,7 +207,7 @@ def last_four(entry): for plt_idx,dep_name in zip(range(0,len(deployments)),deployments): - print('plotting '+ str(len(all_deployment_dtemp_dtime[plt_idx])) + ' values') + print('plotting '+ str((all_deployment_dtemp_dtime[plt_idx])) + ' values') hist_data = ax[plt_idx].hist(all_deployment_dtemp_dtime[plt_idx],21,log=True) diff --git a/ocean_dp/qc/temp_stat_plot.py b/ocean_dp/qc/temp_stat_plot.py new file mode 100755 index 0000000..752ebe5 --- /dev/null +++ b/ocean_dp/qc/temp_stat_plot.py @@ -0,0 +1,107 @@ +# Copyright (C) 2020 Ben Weeding +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +import numpy.ma as ma +import sys +from netCDF4 import Dataset, num2date +from dateutil import parser +import numpy as np +import argparse +import glob +import pytz +import os +import matplotlib.pyplot as plt +from matplotlib import colors +from matplotlib.ticker import PercentFormatter +from sigfig import round +import pandas as pd + +# creates an empty array to store the names of the SOTS deployments +deployments = [] + +# loops through all the folders and files contained in the folder +for x in os.listdir("/Users/tru050/Desktop/cloudstor/Shared/SOTS-Temp-Raw-Data"): + + # if the folder/file name contains 'Pulse' or 'SOFS' and doesn't contain '.', append it to deployments + if (('Pulse' in x) or ('SOFS' in x)) and ('.' not in x): + + deployments.append(x) + +# create a dataframe to store extract information +temp_ensemble = pd.DataFrame(columns = ["Temp rate of change","QC","Nominal depth","Deployment"]) + +# loops through all files in the directory +for root, dirs, files in os.walk("/Users/tru050/Desktop/cloudstor/Shared/SOTS-Temp-Raw-Data"): + + for fname in files: + + # for each netcdf file labelled as FV01 + if fname.endswith('.nc') and 'FV01' in fname: + + # print the filename + print(fname) + + # open the file + nc = Dataset(os.path.join(root,fname), mode = 'r') + + # check that the in_out_water test has been run on the file + if 'TEMP_quality_control_io' in list(nc.variables): + + # check that the file has a single dimension temperature vector, and that the time format is correct + if np.array(nc.variables['TEMP'][:]).ndim == 1 and nc.variables['TIME'].getncattr('units') =='days since 1950-01-01 00:00:00 UTC': + + # calculate temperature changes for in water data + nc_temp_diffs = np.diff(np.array(nc.variables['TEMP'][np.array(nc.variables['TEMP_quality_control'][:])!=7])) + + # extract the time data + nc_time = np.array(nc.variables['TIME'][np.array(nc.variables['TEMP_quality_control'][:])!=7]) + + # Convert from days to hours + nc_time_hr = nc_time*24 + + # Calculate time changes in hours + nc_time_hr_diffs = np.diff(nc_time_hr) + + # calculate the rate of change of temperature wrt time (degrees °C per hour) + nc_dtemp_dtime = np.divide(nc_temp_diffs,nc_time_hr_diffs) + + + + # extract temp_qc data + nc_temp_qc = np.array(nc.variables['TEMP_quality_control'][np.array(nc.variables['TEMP_quality_control'][:])!=7]) + + # calculate qc values for each nc_dtemp_dtime by taking the maximum of the qc values of the two contributing temps + nc_dtemp_dtime_qc = pd.Series(nc_temp_qc).rolling(2).max().dropna().to_numpy() + + + # extract sensor nominal depth + nc_nom_depth = np.array(nc.variables['NOMINAL_DEPTH']) + + + # extract deployment name + nc_deployment = nc.deployment_code + + # Next step: append all this information to temp ensemble! + + nc.close() + + + + # pd.Series(lst).rolling(5).max().dropna().to_numpy() + + + + + \ No newline at end of file diff --git a/ocean_dp/sots_processing_runthrough.py b/ocean_dp/sots_processing_runthrough.py index 75fe6f4..c939838 100755 --- a/ocean_dp/sots_processing_runthrough.py +++ b/ocean_dp/sots_processing_runthrough.py @@ -40,7 +40,7 @@ start = time.time() # Set the working directory -os.chdir('/Users/tru050/Desktop/sofs6 test data') +#os.chdir('/Users/tru050/Desktop/sofs6 test data') # Make a list of FV00 filenames fv00_files = glob.glob('*IMOS_ABOS-SOTS*FV00*.nc') From 4a9590f5c8dfd95ccd2ad9d67baccdea4a097556 Mon Sep 17 00:00:00 2001 From: bweeding <57697604+bweeding@users.noreply.github.com> Date: Wed, 20 May 2020 10:15:26 +1000 Subject: [PATCH 40/59] Update temp_stat_plot.py Code loads and processes SOTS data for std calc --- ocean_dp/qc/temp_stat_plot.py | 189 ++++++++++++++++++++++++++++++++-- 1 file changed, 178 insertions(+), 11 deletions(-) diff --git a/ocean_dp/qc/temp_stat_plot.py b/ocean_dp/qc/temp_stat_plot.py index 752ebe5..ab93f2c 100755 --- a/ocean_dp/qc/temp_stat_plot.py +++ b/ocean_dp/qc/temp_stat_plot.py @@ -28,36 +28,100 @@ from sigfig import round import pandas as pd +############################# Data extraction ################################ + # creates an empty array to store the names of the SOTS deployments deployments = [] +checked_files = [] + +processed_files = [] + # loops through all the folders and files contained in the folder for x in os.listdir("/Users/tru050/Desktop/cloudstor/Shared/SOTS-Temp-Raw-Data"): # if the folder/file name contains 'Pulse' or 'SOFS' and doesn't contain '.', append it to deployments - if (('Pulse' in x) or ('SOFS' in x)) and ('.' not in x): + if (('Pulse' in x) or ('SOFS' in x)) and ('.p' not in x): deployments.append(x) # create a dataframe to store extract information -temp_ensemble = pd.DataFrame(columns = ["Temp rate of change","QC","Nominal depth","Deployment"]) +sots_temp_ensemble = pd.DataFrame(columns = ["Temp rate of change","QC","Nominal depth","Deployment"]) # loops through all files in the directory for root, dirs, files in os.walk("/Users/tru050/Desktop/cloudstor/Shared/SOTS-Temp-Raw-Data"): for fname in files: - # for each netcdf file labelled as FV01 - if fname.endswith('.nc') and 'FV01' in fname: + # append the filename to the list of checked files + checked_files.append(fname) + + # for each netcdf file labelled as FV01 and containing a deployment in its name + if fname.endswith('.nc') and 'FV01' in fname and any(ele in fname for ele in deployments): # print the filename print(fname) # open the file - nc = Dataset(os.path.join(root,fname), mode = 'r') + nc = Dataset(os.path.join(root,fname), mode = 'a') + + # check file contains temperature data + if 'TEMP' in list(nc.variables): - # check that the in_out_water test has been run on the file - if 'TEMP_quality_control_io' in list(nc.variables): + # check that the in_out_water test has been run on the file, if not run in_out_water code + if not 'TEMP_quality_control_io' in list(nc.variables): + + # run in_out_water script - uncommented at this point as just copied and pasted + var_name = 'TEMP' + nc_vars = nc.variables + to_add = [] + if var_name: + to_add.append(var_name) + else: + for v in nc_vars: + #print (vars[v].dimensions) + if v != 'TIME': + to_add.append(v) + + time_var = nc_vars["TIME"] + time = num2date(time_var[:], units=time_var.units, calendar=time_var.calendar) + + time_deploy = parser.parse(nc.time_deployment_start, ignoretz=True) + time_recovery = parser.parse(nc.time_deployment_end, ignoretz=True) + + print('deployment time', time_deploy) + + print(to_add) + + # create a mask for the time range + mask = (time <= time_deploy) | (time >= time_recovery) + + for v in to_add: + if "TIME" in nc_vars[v].dimensions: + if v.endswith("_quality_control"): + print("QC time dim ", v) + + ncVarOut = nc_vars[v] + ncVarOut[mask] = 7 + else: + # create a qc variable just for this test flags + if v + "_quality_control_io" in nc.variables: + ncVarOut = nc.variables[v + "_quality_control_io"] + else: + ncVarOut = nc.createVariable(v + "_quality_control_io", "i1", nc_vars[v].dimensions, fill_value=99, zlib=True) # fill_value=0 otherwise defaults to max + ncVarOut[:] = np.zeros(nc_vars[v].shape) + ncVarOut.long_name = "quality flag for " + v + ncVarOut.flag_values = np.array([0, 1, 2, 3, 4, 6, 7, 9], dtype=np.int8) + ncVarOut.flag_meanings = 'unknown good_data probably_good_data probably_bad_data bad_data not_deployed interpolated missing_value' + + nc_vars[v].ancillary_variables = nc_vars[v].ancillary_variables + " " + v + "_quality_control_io" + ncVarOut[mask] = 7 + + nc.variables[v + "_quality_control"][:] = np.maximum(nc.variables[v + "_quality_control_io"][:],nc.variables[v + "_quality_control"][:]) + + nc.file_version = "Level 1 - Quality Controlled Data" + + # check that the file has a single dimension temperature vector, and that the time format is correct if np.array(nc.variables['TEMP'][:]).ndim == 1 and nc.variables['TIME'].getncattr('units') =='days since 1950-01-01 00:00:00 UTC': @@ -86,22 +150,125 @@ nc_dtemp_dtime_qc = pd.Series(nc_temp_qc).rolling(2).max().dropna().to_numpy() + # extract sensor nominal depth nc_nom_depth = np.array(nc.variables['NOMINAL_DEPTH']) + # create a vector the same size as nc_dtemp_dtime with the nominal depth + nc_nom_depth_vector = np.repeat(nc_nom_depth,len(nc_dtemp_dtime)) + + # extract deployment name nc_deployment = nc.deployment_code - # Next step: append all this information to temp ensemble! + # create a list the same size as nc_dtemp_dtime with the deployment name + nc_deployment_list = [nc_deployment] * len(nc_dtemp_dtime) + + + + # combine information into an length x 4 dataframe + nc_temp_ensemble = pd.DataFrame({"Temp rate of change":nc_dtemp_dtime,"QC":nc_dtemp_dtime_qc,"Nominal depth":nc_nom_depth_vector,"Deployment":nc_deployment_list}) + + # append the current netcdf's dataframe to the sots_temp_ensemble + sots_temp_ensemble = sots_temp_ensemble.append(nc_temp_ensemble) + + # append the filename to the list of processed files + processed_files.append(fname) + nc.close() +############################# Data processing ################################ - # pd.Series(lst).rolling(5).max().dropna().to_numpy() - - +# creates a new dataframe containing only data with QC < 3 +sots_temp_ensemble_qc210 = sots_temp_ensemble[sots_temp_ensemble["QC"]<3] + +# calculates overall standard deviation +std_total = np.std(sots_temp_ensemble_qc210["Temp rate of change"]) + + + + +# creates an emply list to store data deployment by deployment +std_by_deployment_data = [] + +# creates a dict of deployment names and standard deviations +for i in sots_temp_ensemble_qc210.Deployment.unique(): + std_by_deployment_data.append( + { + 'Deployment': i, + 'STD': np.std(sots_temp_ensemble_qc210["Temp rate of change"][sots_temp_ensemble_qc210["Deployment"]==i]), + } + ) + +# creates a Dataframe from the dict +std_by_deployment = pd.DataFrame(std_by_deployment_data) + + + + + +# ============================================================================= +# std_by_depth: this function takes two compulsary arguments (top: the shallowest +# depth(m)), bottom: the deepest depth(m)) and one option argument (deployment_in: +# the deployment from which data will be taken). The function will return the standard +# deviation of the d(Temp)/d(Time) data from sensors with nominal depths at and +# between the two depths, and from only the deployment_in if specified. +# +# sample call: std_by_depth(500,10000,'SOFS-7.5-2018') +# +# this will give the std of all d(Temp)/d(Time) data from SOFS-7.5-2018 from +# sensors with 500m <= nominal depth <= 10000m +# ============================================================================= + +def std_by_depth(top,bottom,deployment_in=None): + + if deployment_in == None: + + # subsamples sots_temp_ensemble_qc210 based on depth + target_ensemble = sots_temp_ensemble_qc210[(sots_temp_ensemble_qc210["Nominal depth"]>=top) & (sots_temp_ensemble_qc210["Nominal depth"]<=bottom)] + + # calculates the standard deviation of the subsample + target_std = np.std(target_ensemble["Temp rate of change"]) + + # returns the standard deviation of the subsample + return target_std + + else: + + # subsamples sots_temp_ensemble_qc210 based on depth + target_ensemble = sots_temp_ensemble_qc210[(sots_temp_ensemble_qc210["Nominal depth"]>=top) & (sots_temp_ensemble_qc210["Nominal depth"]<=bottom) & (sots_temp_ensemble_qc210["Deployment"]==deployment_in)] + + # calculates the standard deviation of the subsample + target_std = np.std(target_ensemble["Temp rate of change"]) + + # returns the standard deviation of the subsample + return target_std + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file From f1d5618bc204a64bb15c19a4d525832eee08bd58 Mon Sep 17 00:00:00 2001 From: bweeding <57697604+bweeding@users.noreply.github.com> Date: Wed, 20 May 2020 11:53:13 +1000 Subject: [PATCH 41/59] Update temp_stat_plot.py adding plotting to function and improved if else layout --- ocean_dp/qc/temp_stat_plot.py | 30 ++++++++++++++++++++---------- 1 file changed, 20 insertions(+), 10 deletions(-) diff --git a/ocean_dp/qc/temp_stat_plot.py b/ocean_dp/qc/temp_stat_plot.py index ab93f2c..bc73767 100755 --- a/ocean_dp/qc/temp_stat_plot.py +++ b/ocean_dp/qc/temp_stat_plot.py @@ -230,22 +230,32 @@ def std_by_depth(top,bottom,deployment_in=None): # subsamples sots_temp_ensemble_qc210 based on depth target_ensemble = sots_temp_ensemble_qc210[(sots_temp_ensemble_qc210["Nominal depth"]>=top) & (sots_temp_ensemble_qc210["Nominal depth"]<=bottom)] - # calculates the standard deviation of the subsample - target_std = np.std(target_ensemble["Temp rate of change"]) - - # returns the standard deviation of the subsample - return target_std - else: # subsamples sots_temp_ensemble_qc210 based on depth target_ensemble = sots_temp_ensemble_qc210[(sots_temp_ensemble_qc210["Nominal depth"]>=top) & (sots_temp_ensemble_qc210["Nominal depth"]<=bottom) & (sots_temp_ensemble_qc210["Deployment"]==deployment_in)] - # calculates the standard deviation of the subsample - target_std = np.std(target_ensemble["Temp rate of change"]) + # calculates the mean of the subsample + target_mean = np.mean(target_ensemble["Temp rate of change"]) + + # calculates the standard deviation of the subsample + target_std = np.std(target_ensemble["Temp rate of change"]) - # returns the standard deviation of the subsample - return target_std + line_thick = 1 + + ax_hist=plt.axes() + + target_ensemble.hist(column="Temp rate of change",bins=100,log=True,ax=ax_hist) + + ax_hist.axvline(x=target_mean+3*target_std,color='r',linewidth=line_thick) + + ax_hist.axvline(x=target_mean-3*target_std,color='r',linewidth=line_thick) + + + # returns the standard deviation of the subsample + return target_std + + From 528c5cf2de4062d76e59edf0524ee3405848a144 Mon Sep 17 00:00:00 2001 From: bweeding <57697604+bweeding@users.noreply.github.com> Date: Wed, 20 May 2020 11:54:25 +1000 Subject: [PATCH 42/59] Update temp_stat_plot.py --- ocean_dp/qc/temp_stat_plot.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/ocean_dp/qc/temp_stat_plot.py b/ocean_dp/qc/temp_stat_plot.py index bc73767..63673ba 100755 --- a/ocean_dp/qc/temp_stat_plot.py +++ b/ocean_dp/qc/temp_stat_plot.py @@ -241,12 +241,16 @@ def std_by_depth(top,bottom,deployment_in=None): # calculates the standard deviation of the subsample target_std = np.std(target_ensemble["Temp rate of change"]) + # sets line thickness for plot line_thick = 1 + # creates axes for histogram ax_hist=plt.axes() + # plots a histogram of the data selected target_ensemble.hist(column="Temp rate of change",bins=100,log=True,ax=ax_hist) + # draws lines at the mean +- 3 STD on the histogram ax_hist.axvline(x=target_mean+3*target_std,color='r',linewidth=line_thick) ax_hist.axvline(x=target_mean-3*target_std,color='r',linewidth=line_thick) From 6f3748ecb6dcd061acae485cc099e5ad22fc5ded Mon Sep 17 00:00:00 2001 From: bweeding <57697604+bweeding@users.noreply.github.com> Date: Wed, 20 May 2020 14:45:28 +1000 Subject: [PATCH 43/59] Update temp_stat_plot.py --- ocean_dp/qc/temp_stat_plot.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/ocean_dp/qc/temp_stat_plot.py b/ocean_dp/qc/temp_stat_plot.py index 63673ba..fd0b86a 100755 --- a/ocean_dp/qc/temp_stat_plot.py +++ b/ocean_dp/qc/temp_stat_plot.py @@ -255,6 +255,30 @@ def std_by_depth(top,bottom,deployment_in=None): ax_hist.axvline(x=target_mean-3*target_std,color='r',linewidth=line_thick) + # sets the x label + ax_hist.set_xlabel('°C/hr') + + + label_coords = (0.65, 0.8) + label_method = 'axes fraction' + + anno = 'mean = '+str(round(float(target_mean),sigfigs=3)) + + anno += '\n3 STD = ' + str(round(float(3*target_std),sigfigs=3)) + + anno += '\nno. samples = ' + str(len(target_ensemble)) + + anno += '\n'+str(top)+'m <= depth <= '+str(bottom)+'m' + + if deployment_in == None: + + anno += '\nall available data' + + else: + + anno += '\n'+deployment_in + + ax_hist.annotate(anno,xy=label_coords, xycoords=label_method,fontsize=8) # returns the standard deviation of the subsample return target_std From 7d28a8c13ceb46e64b14e82e9565502a4fed916c Mon Sep 17 00:00:00 2001 From: bweeding <57697604+bweeding@users.noreply.github.com> Date: Wed, 20 May 2020 16:19:29 +1000 Subject: [PATCH 44/59] creation of psal stat plot --- ocean_dp/qc/psal_stat_plot.py | 310 ++++++++++++++++++++++++++++++++++ ocean_dp/qc/temp_stat_plot.py | 2 +- 2 files changed, 311 insertions(+), 1 deletion(-) create mode 100755 ocean_dp/qc/psal_stat_plot.py diff --git a/ocean_dp/qc/psal_stat_plot.py b/ocean_dp/qc/psal_stat_plot.py new file mode 100755 index 0000000..8449b2a --- /dev/null +++ b/ocean_dp/qc/psal_stat_plot.py @@ -0,0 +1,310 @@ +# Copyright (C) 2020 Ben Weeding +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + + +import numpy.ma as ma +import sys +from netCDF4 import Dataset, num2date +from dateutil import parser +import numpy as np +import argparse +import glob +import pytz +import os +import matplotlib.pyplot as plt +from matplotlib import colors +from matplotlib.ticker import PercentFormatter +from sigfig import round +import pandas as pd + +############################# Data extraction ################################ + +# creates an empty array to store the names of the SOTS deployments +deployments = [] + +checked_files = [] + +processed_files = [] + +# loops through all the folders and files contained in the folder +for x in os.listdir("/Users/tru050/Desktop/cloudstor/Shared/SOTS-Temp-Raw-Data"): + + # if the folder/file name contains 'Pulse' or 'SOFS' and doesn't contain '.', append it to deployments + if (('Pulse' in x) or ('SOFS' in x)) and ('.p' not in x): + + deployments.append(x) + + +# create a dataframe to store extract information +sots_psal_ensemble = pd.DataFrame(columns = ["PSAL rate of change","QC","Nominal depth","Deployment"]) + + +# loops through all files in the directory +for root, dirs, files in os.walk("/Users/tru050/Desktop/cloudstor/Shared/SOTS-Temp-Raw-Data"): + + for fname in files: + + # append the filename to the list of checked files + checked_files.append(fname) + + # for each netcdf file labelled as FV01 and containing a deployment in its name + if fname.endswith('.nc') and 'FV01' in fname and any(ele in fname for ele in deployments): + + # print the filename + print(fname) + + # open the file + nc = Dataset(os.path.join(root,fname), mode = 'a') + + # check file contains psalerature data + if 'PSAL' in list(nc.variables): + + # check that the in_out_water test has been run on the file, if not run in_out_water code + if not 'PSAL_quality_control_io' in list(nc.variables): + + # run in_out_water script - uncommented at this point as just copied and pasted + var_name = 'PSAL' + nc_vars = nc.variables + to_add = [] + if var_name: + to_add.append(var_name) + else: + for v in nc_vars: + #print (vars[v].dimensions) + if v != 'TIME': + to_add.append(v) + + time_var = nc_vars["TIME"] + time = num2date(time_var[:], units=time_var.units, calendar=time_var.calendar) + + time_deploy = parser.parse(nc.time_deployment_start, ignoretz=True) + time_recovery = parser.parse(nc.time_deployment_end, ignoretz=True) + + print('deployment time', time_deploy) + + print(to_add) + + # create a mask for the time range + mask = (time <= time_deploy) | (time >= time_recovery) + + for v in to_add: + if "TIME" in nc_vars[v].dimensions: + if v.endswith("_quality_control"): + print("QC time dim ", v) + + ncVarOut = nc_vars[v] + ncVarOut[mask] = 7 + else: + # create a qc variable just for this test flags + if v + "_quality_control_io" in nc.variables: + ncVarOut = nc.variables[v + "_quality_control_io"] + else: + ncVarOut = nc.createVariable(v + "_quality_control_io", "i1", nc_vars[v].dimensions, fill_value=99, zlib=True) # fill_value=0 otherwise defaults to max + ncVarOut[:] = np.zeros(nc_vars[v].shape) + ncVarOut.long_name = "quality flag for " + v + ncVarOut.flag_values = np.array([0, 1, 2, 3, 4, 6, 7, 9], dtype=np.int8) + ncVarOut.flag_meanings = 'unknown good_data probably_good_data probably_bad_data bad_data not_deployed interpolated missing_value' + + nc_vars[v].ancillary_variables = nc_vars[v].ancillary_variables + " " + v + "_quality_control_io" + ncVarOut[mask] = 7 + + nc.variables[v + "_quality_control"][:] = np.maximum(nc.variables[v + "_quality_control_io"][:],nc.variables[v + "_quality_control"][:]) + + nc.file_version = "Level 1 - Quality Controlled Data" + + + + # check that the file has a single dimension psalerature vector, and that the time format is correct + if np.array(nc.variables['PSAL'][:]).ndim == 1 and nc.variables['TIME'].getncattr('units') =='days since 1950-01-01 00:00:00 UTC': + + # calculate psalerature changes for in water data + nc_psal_diffs = np.diff(np.array(nc.variables['PSAL'][np.array(nc.variables['PSAL_quality_control'][:])!=7])) + + # extract the time data + nc_time = np.array(nc.variables['TIME'][np.array(nc.variables['PSAL_quality_control'][:])!=7]) + + # Convert from days to hours + nc_time_hr = nc_time*24 + + # Calculate time changes in hours + nc_time_hr_diffs = np.diff(nc_time_hr) + + # calculate the rate of change of psalerature wrt time (degrees °C per hour) + nc_dpsal_dtime = np.divide(nc_psal_diffs,nc_time_hr_diffs) + + + + # extract psal_qc data + nc_psal_qc = np.array(nc.variables['PSAL_quality_control'][np.array(nc.variables['PSAL_quality_control'][:])!=7]) + + # calculate qc values for each nc_dpsal_dtime by taking the maximum of the qc values of the two contributing psals + nc_dpsal_dtime_qc = pd.Series(nc_psal_qc).rolling(2).max().dropna().to_numpy() + + + + # extract sensor nominal depth + nc_nom_depth = np.array(nc.variables['NOMINAL_DEPTH']) + + # create a vector the same size as nc_dpsal_dtime with the nominal depth + nc_nom_depth_vector = np.repeat(nc_nom_depth,len(nc_dpsal_dtime)) + + + + # extract deployment name + nc_deployment = nc.deployment_code + + # create a list the same size as nc_dpsal_dtime with the deployment name + nc_deployment_list = [nc_deployment] * len(nc_dpsal_dtime) + + + + # combine information into an length x 4 dataframe + nc_psal_ensemble = pd.DataFrame({"Psal rate of change":nc_dpsal_dtime,"QC":nc_dpsal_dtime_qc,"Nominal depth":nc_nom_depth_vector,"Deployment":nc_deployment_list}) + + # append the current netcdf's dataframe to the sots_psal_ensemble + sots_psal_ensemble = sots_psal_ensemble.append(nc_psal_ensemble) + + # append the filename to the list of processed files + processed_files.append(fname) + + + nc.close() + + +############################# Data processing ################################ + +# creates a new dataframe containing only data with QC < 3 +sots_psal_ensemble_qc210 = sots_psal_ensemble[sots_psal_ensemble["QC"]<3] + +# calculates overall standard deviation +std_total = np.std(sots_psal_ensemble_qc210["Psal rate of change"]) + + + + +# creates an emply list to store data deployment by deployment +std_by_deployment_data = [] + +# creates a dict of deployment names and standard deviations +for i in sots_psal_ensemble_qc210.Deployment.unique(): + std_by_deployment_data.append( + { + 'Deployment': i, + 'STD': np.std(sots_psal_ensemble_qc210["Psal rate of change"][sots_psal_ensemble_qc210["Deployment"]==i]), + } + ) + +# creates a Dataframe from the dict +std_by_deployment = pd.DataFrame(std_by_deployment_data) + + + + + +# ============================================================================= +# std_by_depth: this function takes two compulsary arguments (top: the shallowest +# depth(m)), bottom: the deepest depth(m)) and one option argument (deployment_in: +# the deployment from which data will be taken). The function will return the standard +# deviation of the d(psal)/d(Time) data from sensors with nominal depths at and +# between the two depths, and from only the deployment_in if specified. +# +# sample call: std_by_depth(500,10000,'SOFS-7.5-2018') +# +# this will give the std of all d(psal)/d(Time) data from SOFS-7.5-2018 from +# sensors with 500m <= nominal depth <= 10000m +# ============================================================================= + +def std_by_depth_psal(top,bottom,deployment_in=None): + + if deployment_in == None: + + # subsamples sots_psal_ensemble_qc210 based on depth + target_ensemble = sots_psal_ensemble_qc210[(sots_psal_ensemble_qc210["Nominal depth"]>=top) & (sots_psal_ensemble_qc210["Nominal depth"]<=bottom)] + + else: + + # subsamples sots_psal_ensemble_qc210 based on depth + target_ensemble = sots_psal_ensemble_qc210[(sots_psal_ensemble_qc210["Nominal depth"]>=top) & (sots_psal_ensemble_qc210["Nominal depth"]<=bottom) & (sots_psal_ensemble_qc210["Deployment"]==deployment_in)] + + # calculates the mean of the subsample + target_mean = np.mean(target_ensemble["Psal rate of change"]) + + # calculates the standard deviation of the subsample + target_std = np.std(target_ensemble["Psal rate of change"]) + + # sets line thickness for plot + line_thick = 1 + + # creates axes for histogram + ax_hist=plt.axes() + + # plots a histogram of the data selected + target_ensemble.hist(column="Psal rate of change",bins=100,log=True,ax=ax_hist) + + # draws lines at the mean +- 3 STD on the histogram + ax_hist.axvline(x=target_mean+3*target_std,color='r',linewidth=line_thick) + + ax_hist.axvline(x=target_mean-3*target_std,color='r',linewidth=line_thick) + + # sets the x label + ax_hist.set_xlabel('PSU/hr') + + + label_coords = (0.65, 0.8) + label_method = 'axes fraction' + + anno = 'mean = '+str(round(float(target_mean),sigfigs=3)) + + anno += '\n3 STD = ' + str(round(float(3*target_std),sigfigs=3)) + + anno += '\nno. samples = ' + str(len(target_ensemble)) + + anno += '\n'+str(top)+'m <= depth <= '+str(bottom)+'m' + + if deployment_in == None: + + anno += '\nall available data' + + else: + + anno += '\n'+deployment_in + + ax_hist.annotate(anno,xy=label_coords, xycoords=label_method,fontsize=8) + + # returns the standard deviation of the subsample + return target_std + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/ocean_dp/qc/temp_stat_plot.py b/ocean_dp/qc/temp_stat_plot.py index fd0b86a..6eda254 100755 --- a/ocean_dp/qc/temp_stat_plot.py +++ b/ocean_dp/qc/temp_stat_plot.py @@ -223,7 +223,7 @@ # sensors with 500m <= nominal depth <= 10000m # ============================================================================= -def std_by_depth(top,bottom,deployment_in=None): +def std_by_depth_temp(top,bottom,deployment_in=None): if deployment_in == None: From 56e469a954da103104a8089a40d7cf4f807bd3dc Mon Sep 17 00:00:00 2001 From: bweeding <57697604+bweeding@users.noreply.github.com> Date: Thu, 21 May 2020 09:59:19 +1000 Subject: [PATCH 45/59] modified stat plot code -now swaps depths if inputted incorrectly -can now input list of deployments -fixed label location --- ocean_dp/qc/psal_stat_plot.py | 33 ++++++++++++++++++++++---- ocean_dp/qc/temp_stat_plot.py | 27 +++++++++++++++++++-- ocean_dp/sots_processing_runthrough.py | 2 +- 3 files changed, 55 insertions(+), 7 deletions(-) diff --git a/ocean_dp/qc/psal_stat_plot.py b/ocean_dp/qc/psal_stat_plot.py index 8449b2a..46999bc 100755 --- a/ocean_dp/qc/psal_stat_plot.py +++ b/ocean_dp/qc/psal_stat_plot.py @@ -228,15 +228,32 @@ def std_by_depth_psal(top,bottom,deployment_in=None): + # if user incorrectly inputs depths, swap them and run the code + if top > bottom: + + top, bottom = bottom, top + if deployment_in == None: # subsamples sots_psal_ensemble_qc210 based on depth target_ensemble = sots_psal_ensemble_qc210[(sots_psal_ensemble_qc210["Nominal depth"]>=top) & (sots_psal_ensemble_qc210["Nominal depth"]<=bottom)] + elif isinstance(deployment_in, list): + + # subsamples sots_psal_ensemble_qc210 based on depth + target_ensemble = sots_psal_ensemble_qc210[(sots_psal_ensemble_qc210["Nominal depth"]>=top) & (sots_psal_ensemble_qc210["Nominal depth"]<=bottom) & (sots_psal_ensemble_qc210.Deployment.isin(deployment_in))] + + else: # subsamples sots_psal_ensemble_qc210 based on depth target_ensemble = sots_psal_ensemble_qc210[(sots_psal_ensemble_qc210["Nominal depth"]>=top) & (sots_psal_ensemble_qc210["Nominal depth"]<=bottom) & (sots_psal_ensemble_qc210["Deployment"]==deployment_in)] + + # if not data is available with the given choices, end the function + if len(target_ensemble)==0: + + return 'No data available for those choices' + # calculates the mean of the subsample target_mean = np.mean(target_ensemble["Psal rate of change"]) @@ -262,7 +279,7 @@ def std_by_depth_psal(top,bottom,deployment_in=None): ax_hist.set_xlabel('PSU/hr') - label_coords = (0.65, 0.8) + label_coords = (0.70, 0.99) label_method = 'axes fraction' anno = 'mean = '+str(round(float(target_mean),sigfigs=3)) @@ -277,19 +294,27 @@ def std_by_depth_psal(top,bottom,deployment_in=None): anno += '\nall available data' + elif isinstance(deployment_in, list): + + anno += '\n' + + anno += '\n'.join(deployment_in) + else: anno += '\n'+deployment_in - ax_hist.annotate(anno,xy=label_coords, xycoords=label_method,fontsize=8) + ax_hist.annotate(anno,xy=label_coords, xycoords=label_method,fontsize=8,va = "top", ha="left") # returns the standard deviation of the subsample return target_std - - + + + + diff --git a/ocean_dp/qc/temp_stat_plot.py b/ocean_dp/qc/temp_stat_plot.py index 6eda254..9efd849 100755 --- a/ocean_dp/qc/temp_stat_plot.py +++ b/ocean_dp/qc/temp_stat_plot.py @@ -225,15 +225,32 @@ def std_by_depth_temp(top,bottom,deployment_in=None): + # if user incorrectly inputs depths, swap them and run the code + if top > bottom: + + top, bottom = bottom, top + if deployment_in == None: # subsamples sots_temp_ensemble_qc210 based on depth target_ensemble = sots_temp_ensemble_qc210[(sots_temp_ensemble_qc210["Nominal depth"]>=top) & (sots_temp_ensemble_qc210["Nominal depth"]<=bottom)] + elif isinstance(deployment_in, list): + + # subsamples sots_temp_ensemble_qc210 based on depth + target_ensemble = sots_temp_ensemble_qc210[(sots_temp_ensemble_qc210["Nominal depth"]>=top) & (sots_temp_ensemble_qc210["Nominal depth"]<=bottom) & (sots_temp_ensemble_qc210.Deployment.isin(deployment_in))] + + else: # subsamples sots_temp_ensemble_qc210 based on depth target_ensemble = sots_temp_ensemble_qc210[(sots_temp_ensemble_qc210["Nominal depth"]>=top) & (sots_temp_ensemble_qc210["Nominal depth"]<=bottom) & (sots_temp_ensemble_qc210["Deployment"]==deployment_in)] + + # if not data is available with the given choices, end the function + if len(target_ensemble)==0: + + return 'No data available for those choices' + # calculates the mean of the subsample target_mean = np.mean(target_ensemble["Temp rate of change"]) @@ -259,7 +276,7 @@ def std_by_depth_temp(top,bottom,deployment_in=None): ax_hist.set_xlabel('°C/hr') - label_coords = (0.65, 0.8) + label_coords = (0.70, 0.99) label_method = 'axes fraction' anno = 'mean = '+str(round(float(target_mean),sigfigs=3)) @@ -274,11 +291,17 @@ def std_by_depth_temp(top,bottom,deployment_in=None): anno += '\nall available data' + elif isinstance(deployment_in, list): + + anno += '\n' + + anno += '\n'.join(deployment_in) + else: anno += '\n'+deployment_in - ax_hist.annotate(anno,xy=label_coords, xycoords=label_method,fontsize=8) + ax_hist.annotate(anno,xy=label_coords, xycoords=label_method,fontsize=8,va = "top", ha="left") # returns the standard deviation of the subsample return target_std diff --git a/ocean_dp/sots_processing_runthrough.py b/ocean_dp/sots_processing_runthrough.py index c939838..cc7bc30 100755 --- a/ocean_dp/sots_processing_runthrough.py +++ b/ocean_dp/sots_processing_runthrough.py @@ -40,7 +40,7 @@ start = time.time() # Set the working directory -#os.chdir('/Users/tru050/Desktop/sofs6 test data') +#os.chdir('‎/Users/tru050/Desktop/cloudstor/Shared/SOTS-Temp-Raw-Data/SOFS-5-2015') # Make a list of FV00 filenames fv00_files = glob.glob('*IMOS_ABOS-SOTS*FV00*.nc') From 4317aff5f13bead9c778a4c68ec512b03436e61d Mon Sep 17 00:00:00 2001 From: bweeding <57697604+bweeding@users.noreply.github.com> Date: Thu, 21 May 2020 13:16:29 +1000 Subject: [PATCH 46/59] Update temp_stat_plot.py adds per sample functionality --- ocean_dp/qc/temp_stat_plot.py | 41 +++++++++++++++++++++++++++-------- 1 file changed, 32 insertions(+), 9 deletions(-) diff --git a/ocean_dp/qc/temp_stat_plot.py b/ocean_dp/qc/temp_stat_plot.py index 9efd849..bc1020d 100755 --- a/ocean_dp/qc/temp_stat_plot.py +++ b/ocean_dp/qc/temp_stat_plot.py @@ -46,7 +46,7 @@ deployments.append(x) # create a dataframe to store extract information -sots_temp_ensemble = pd.DataFrame(columns = ["Temp rate of change","QC","Nominal depth","Deployment"]) +sots_temp_ensemble = pd.DataFrame(columns = ["dTemp/dtime","dTemp/dSample","QC","Nominal depth","Deployment"]) # loops through all files in the directory for root, dirs, files in os.walk("/Users/tru050/Desktop/cloudstor/Shared/SOTS-Temp-Raw-Data"): @@ -168,7 +168,7 @@ # combine information into an length x 4 dataframe - nc_temp_ensemble = pd.DataFrame({"Temp rate of change":nc_dtemp_dtime,"QC":nc_dtemp_dtime_qc,"Nominal depth":nc_nom_depth_vector,"Deployment":nc_deployment_list}) + nc_temp_ensemble = pd.DataFrame({"dTemp/dtime":nc_dtemp_dtime,"dTemp/dSample":nc_temp_diffs,"QC":nc_dtemp_dtime_qc,"Nominal depth":nc_nom_depth_vector,"Deployment":nc_deployment_list}) # append the current netcdf's dataframe to the sots_temp_ensemble sots_temp_ensemble = sots_temp_ensemble.append(nc_temp_ensemble) @@ -186,7 +186,7 @@ sots_temp_ensemble_qc210 = sots_temp_ensemble[sots_temp_ensemble["QC"]<3] # calculates overall standard deviation -std_total = np.std(sots_temp_ensemble_qc210["Temp rate of change"]) +std_time_total = np.std(sots_temp_ensemble_qc210["dTemp/dtime"]) @@ -199,7 +199,8 @@ std_by_deployment_data.append( { 'Deployment': i, - 'STD': np.std(sots_temp_ensemble_qc210["Temp rate of change"][sots_temp_ensemble_qc210["Deployment"]==i]), + 'STD time': np.std(sots_temp_ensemble_qc210["dTemp/dtime"][sots_temp_ensemble_qc210["Deployment"]==i]), + 'STD sample': np.std(sots_temp_ensemble_qc210["dTemp/dSample"][sots_temp_ensemble_qc210["Deployment"]==i]) } ) @@ -223,7 +224,23 @@ # sensors with 500m <= nominal depth <= 10000m # ============================================================================= -def std_by_depth_temp(top,bottom,deployment_in=None): +def std_by_depth_temp(top,bottom,deployment_in=None,rate='time'): + + selection = '' + + if rate == 'time': + + selection = "dTemp/dtime" + + elif rate == 'sample': + + selection = "dTemp/dSample" + + else: + + return "incorrect rate specification" + + # if user incorrectly inputs depths, swap them and run the code if top > bottom: @@ -253,10 +270,10 @@ def std_by_depth_temp(top,bottom,deployment_in=None): # calculates the mean of the subsample - target_mean = np.mean(target_ensemble["Temp rate of change"]) + target_mean = np.mean(target_ensemble[selection]) # calculates the standard deviation of the subsample - target_std = np.std(target_ensemble["Temp rate of change"]) + target_std = np.std(target_ensemble[selection]) # sets line thickness for plot line_thick = 1 @@ -265,7 +282,7 @@ def std_by_depth_temp(top,bottom,deployment_in=None): ax_hist=plt.axes() # plots a histogram of the data selected - target_ensemble.hist(column="Temp rate of change",bins=100,log=True,ax=ax_hist) + target_ensemble.hist(column=selection,bins=100,log=True,ax=ax_hist) # draws lines at the mean +- 3 STD on the histogram ax_hist.axvline(x=target_mean+3*target_std,color='r',linewidth=line_thick) @@ -273,7 +290,13 @@ def std_by_depth_temp(top,bottom,deployment_in=None): ax_hist.axvline(x=target_mean-3*target_std,color='r',linewidth=line_thick) # sets the x label - ax_hist.set_xlabel('°C/hr') + if rate == 'time': + + ax_hist.set_xlabel('°C/hr') + + elif rate == 'sample': + + ax_hist.set_xlabel('°C') label_coords = (0.70, 0.99) From f6b597fe273c9b3cfd65ca00daea4dec3fc008b3 Mon Sep 17 00:00:00 2001 From: bweeding <57697604+bweeding@users.noreply.github.com> Date: Mon, 25 May 2020 15:29:52 +1000 Subject: [PATCH 47/59] update to stat plots changed labels, added per sample functionality to psal plotter --- ocean_dp/plotting/batch-qc.pdf | Bin 0 -> 208 bytes ocean_dp/plotting/plotQC.py | 4 +-- ocean_dp/qc/psal_stat_plot.py | 42 +++++++++++++++++++------ ocean_dp/qc/temp_stat_plot.py | 9 +++++- ocean_dp/sots_processing_runthrough.py | 4 +-- 5 files changed, 44 insertions(+), 15 deletions(-) create mode 100644 ocean_dp/plotting/batch-qc.pdf diff --git a/ocean_dp/plotting/batch-qc.pdf b/ocean_dp/plotting/batch-qc.pdf new file mode 100644 index 0000000000000000000000000000000000000000..5e7a87baea6886313712bca3fae565bb0266a9f5 GIT binary patch literal 208 zcmY!laB$t4Qr JV8ap40RR*-I}iW> literal 0 HcmV?d00001 diff --git a/ocean_dp/plotting/plotQC.py b/ocean_dp/plotting/plotQC.py index 3060759..979976a 100755 --- a/ocean_dp/plotting/plotQC.py +++ b/ocean_dp/plotting/plotQC.py @@ -67,11 +67,11 @@ def do_plot(fn): DS = xr.open_dataset(fn) ax1 = plt.subplot(2, 1, 1) - plt.plot(DS.TIME, DS.PAR) + plt.plot(DS.TIME, DS.TEMP) plt.title(DS.deployment_code + " - " + DS.instrument_model + ":" + DS.instrument_serial_number + " @ " + str(DS.instrument_nominal_depth), {'fontsize': 8}) ax2 = plt.subplot(2, 1, 2, sharex=ax1) - aux = DS.PAR.ancillary_variables + aux = DS.TEMP.ancillary_variables a_vars = aux.split(" ") for f in sorted(set(a_vars)): print('aux var', f) diff --git a/ocean_dp/qc/psal_stat_plot.py b/ocean_dp/qc/psal_stat_plot.py index 46999bc..cb9cbc7 100755 --- a/ocean_dp/qc/psal_stat_plot.py +++ b/ocean_dp/qc/psal_stat_plot.py @@ -48,7 +48,7 @@ # create a dataframe to store extract information -sots_psal_ensemble = pd.DataFrame(columns = ["PSAL rate of change","QC","Nominal depth","Deployment"]) +sots_psal_ensemble = pd.DataFrame(columns = ["dPsal/dtime","dPsal/dSample","QC","Nominal depth","Deployment"]) # loops through all files in the directory @@ -171,7 +171,7 @@ # combine information into an length x 4 dataframe - nc_psal_ensemble = pd.DataFrame({"Psal rate of change":nc_dpsal_dtime,"QC":nc_dpsal_dtime_qc,"Nominal depth":nc_nom_depth_vector,"Deployment":nc_deployment_list}) + nc_psal_ensemble = pd.DataFrame({"dPsal/dtime":nc_dpsal_dtime,"dPsal/dSample":nc_psal_diffs,"QC":nc_dpsal_dtime_qc,"Nominal depth":nc_nom_depth_vector,"Deployment":nc_deployment_list}) # append the current netcdf's dataframe to the sots_psal_ensemble sots_psal_ensemble = sots_psal_ensemble.append(nc_psal_ensemble) @@ -189,7 +189,7 @@ sots_psal_ensemble_qc210 = sots_psal_ensemble[sots_psal_ensemble["QC"]<3] # calculates overall standard deviation -std_total = np.std(sots_psal_ensemble_qc210["Psal rate of change"]) +std_total = np.std(sots_psal_ensemble_qc210["dPsal/dtime"]) @@ -202,7 +202,8 @@ std_by_deployment_data.append( { 'Deployment': i, - 'STD': np.std(sots_psal_ensemble_qc210["Psal rate of change"][sots_psal_ensemble_qc210["Deployment"]==i]), + 'STD': np.std(sots_psal_ensemble_qc210["dPsal/dtime"][sots_psal_ensemble_qc210["Deployment"]==i]), + 'STD sample': np.std(sots_psal_ensemble_qc210["dPsal/dSample"][sots_psal_ensemble_qc210["Deployment"]==i]) } ) @@ -226,7 +227,23 @@ # sensors with 500m <= nominal depth <= 10000m # ============================================================================= -def std_by_depth_psal(top,bottom,deployment_in=None): +def std_by_depth_psal(top,bottom,deployment_in=None,rate='time'): + + selection = '' + + if rate == 'time': + + selection = "dPsal/dtime" + + elif rate == 'sample': + + selection = "dPsal/dSample" + + else: + + return "incorrect rate specification" + + # if user incorrectly inputs depths, swap them and run the code if top > bottom: @@ -256,10 +273,10 @@ def std_by_depth_psal(top,bottom,deployment_in=None): # calculates the mean of the subsample - target_mean = np.mean(target_ensemble["Psal rate of change"]) + target_mean = np.mean(target_ensemble[selection]) # calculates the standard deviation of the subsample - target_std = np.std(target_ensemble["Psal rate of change"]) + target_std = np.std(target_ensemble[selection]) # sets line thickness for plot line_thick = 1 @@ -268,7 +285,7 @@ def std_by_depth_psal(top,bottom,deployment_in=None): ax_hist=plt.axes() # plots a histogram of the data selected - target_ensemble.hist(column="Psal rate of change",bins=100,log=True,ax=ax_hist) + target_ensemble.hist(column=selection,bins=100,log=True,ax=ax_hist) # draws lines at the mean +- 3 STD on the histogram ax_hist.axvline(x=target_mean+3*target_std,color='r',linewidth=line_thick) @@ -276,7 +293,13 @@ def std_by_depth_psal(top,bottom,deployment_in=None): ax_hist.axvline(x=target_mean-3*target_std,color='r',linewidth=line_thick) # sets the x label - ax_hist.set_xlabel('PSU/hr') + if rate == 'time': + + ax_hist.set_xlabel('PSU/hr') + + elif rate == 'sample': + + ax_hist.set_xlabel('PSU/sample') label_coords = (0.70, 0.99) @@ -314,7 +337,6 @@ def std_by_depth_psal(top,bottom,deployment_in=None): - diff --git a/ocean_dp/qc/temp_stat_plot.py b/ocean_dp/qc/temp_stat_plot.py index bc1020d..5b31a82 100755 --- a/ocean_dp/qc/temp_stat_plot.py +++ b/ocean_dp/qc/temp_stat_plot.py @@ -28,6 +28,12 @@ from sigfig import round import pandas as pd +import warnings +import scipy.stats as st +import statsmodels as sm +import matplotlib + + ############################# Data extraction ################################ # creates an empty array to store the names of the SOTS deployments @@ -296,7 +302,7 @@ def std_by_depth_temp(top,bottom,deployment_in=None,rate='time'): elif rate == 'sample': - ax_hist.set_xlabel('°C') + ax_hist.set_xlabel('°C/sample') label_coords = (0.70, 0.99) @@ -351,6 +357,7 @@ def std_by_depth_temp(top,bottom,deployment_in=None,rate='time'): + diff --git a/ocean_dp/sots_processing_runthrough.py b/ocean_dp/sots_processing_runthrough.py index cc7bc30..c389d27 100755 --- a/ocean_dp/sots_processing_runthrough.py +++ b/ocean_dp/sots_processing_runthrough.py @@ -70,13 +70,13 @@ global_range.global_range(ncfile,'TEMP',40,-2) # Rate of change -rate_of_change_test.roc_test_files(fv01_pres_interp_files,'TEMP',10) +rate_of_change_test.roc_test_files(fv01_pres_interp_files,'TEMP',3.36) # Spike spike_test.spike_test_files(fv01_pres_interp_files,target_vars_in=['TEMP']) # Flatline -flatline_test.flatline_test_files(fv01_pres_interp_files,['TEMP'],window=10) +flatline_test.flatline_test_files(fv01_pres_interp_files,['TEMP'],window=20) # Check qc process has worked fv01_qc_checked = qc_checker.qc_checker_files(fv01_pres_interp_files,['TEMP']) From d3839b146110f506a63722f2592e190f3b819b60 Mon Sep 17 00:00:00 2001 From: bweeding <57697604+bweeding@users.noreply.github.com> Date: Wed, 27 May 2020 16:53:31 +1000 Subject: [PATCH 48/59] Create add_density.py --- ocean_dp/processing/add_density.py | 78 ++++++++++++++++++++++++++++++ 1 file changed, 78 insertions(+) create mode 100755 ocean_dp/processing/add_density.py diff --git a/ocean_dp/processing/add_density.py b/ocean_dp/processing/add_density.py new file mode 100755 index 0000000..6e56a53 --- /dev/null +++ b/ocean_dp/processing/add_density.py @@ -0,0 +1,78 @@ +# Copyright (C) 2020 Ben Weeding and Peter Jansen +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + + +from netCDF4 import Dataset +import sys +import gsw +import numpy as np +from datetime import datetime + +# add density to a data file with TEMP, PSAL, PRES variables, many assumptions are made about the input file +# based on Peter Jansen's addPSAL.py, using TEOS-10 + +def add_density(netCDFfile): + + # loads the netcdf file + ds = Dataset(netCDFfile, 'a') + + if 'DENSITY' in list(ds.variables): + + ds.close() + + return "file already contains density" + + # extracts the variables from the netcdf + var_temp = ds.variables["TEMP"] + var_psal = ds.variables["PSAL"] + var_pres = ds.variables["PRES"] + var_lon = ds.variables["LONGITUDE"] + var_lat = ds.variables["LATITUDE"] + + # extracts the data from the variables + t = var_temp[:] + psal = var_psal[:] + p = var_pres[:] + lon = var_lon[:] + lat = var_lat[:] + + # calculates absolute salinity + SA = gsw.SA_from_SP(psal, p, lon, lat) + + # calculates conservative temperature + CT = gsw.CT_from_t(SA, t, p) + + # calculates density + density = gsw.rho(SA, CT, p) + + + ncVarOut = ds.createVariable("DENSITY", "f4", ("TIME",), fill_value=np.nan, zlib=True) # fill_value=nan otherwise defaults to max + ncVarOut[:] = density + ncVarOut.units = "kg/m^3" + ncVarOut.comment = "calculated using gsw-python https://teos-10.github.io/GSW-Python/index.html" + + # update the history attribute + try: + hist = ds.history + "\n" + except AttributeError: + hist = "" + + ds.setncattr('history', hist + datetime.utcnow().strftime("%Y-%m-%d") + " : added DENSITY from TEMP, PSAL, PRES, LAT, LON") + + ds.close() + + +if __name__ == "__main__": + add_density(sys.argv[1]) From 031fd0e9e0d022b697448cef3967c9784b8ac909 Mon Sep 17 00:00:00 2001 From: bweeding <57697604+bweeding@users.noreply.github.com> Date: Thu, 28 May 2020 16:11:27 +1000 Subject: [PATCH 49/59] Making Pandas Don't know what zoos are complaining about --- ocean_dp/file_name/find_file_with.py | 90 +++++++++++++++++++ ocean_dp/plotting/density_plot.py | 25 ++++++ ocean_dp/plotting/panda_maker.py | 126 +++++++++++++++++++++++++++ ocean_dp/processing/add_density.py | 4 +- 4 files changed, 244 insertions(+), 1 deletion(-) create mode 100755 ocean_dp/file_name/find_file_with.py create mode 100755 ocean_dp/plotting/density_plot.py create mode 100755 ocean_dp/plotting/panda_maker.py diff --git a/ocean_dp/file_name/find_file_with.py b/ocean_dp/file_name/find_file_with.py new file mode 100755 index 0000000..59225b3 --- /dev/null +++ b/ocean_dp/file_name/find_file_with.py @@ -0,0 +1,90 @@ +#!/usr/bin/python3 + +# raw2netCDF +# Copyright (C) 2019 Peter Jansen +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +import glob + +import sys +import re + +from netCDF4 import Dataset + + +def find_files_pattern(file_pattern): + match_files = [] + files = glob.glob(file_pattern) + + match_files.extend(files) + return match_files + +def find_global(files, attribute, regexp): + + match_files = [] + #print("find", file_pattern, files) + for f in files: + #print("check file", f) + ds = Dataset(f, 'r') + if attribute in ds.ncattrs(): + if re.match(regexp, ds.getncattr(attribute)): + match_files.append(f) + ds.close() + + return match_files + + +def find_variable(files, variable): + + match_files = [] + for f in files: + #print("check file", f) + ds = Dataset(f, 'r') + if variable in ds.variables: + match_files.append(f) + ds.close() + + return match_files + + +def find_variable_attribute(files, attribute, value): + + match_files = [] + for f in files: + #print("check file", f) + ds = Dataset(f, 'r') + nv = {attribute : value} + find = ds.get_variables_by_attributes(**nv) + if len(find) > 0: + match_files.append(f) + ds.close() + + return match_files + + +if __name__ == "__main__": + fns = [] + if sys.argv[1] == '-v': + files = find_files_pattern(sys.argv[3]) + fns = find_variable(files, variable=sys.argv[2]) + elif sys.argv[1] == '-a': + files = find_files_pattern(sys.argv[4]) + fns = find_variable_attribute(files, attribute=sys.argv[2], value=sys.argv[3]) + else: + files = find_files_pattern(sys.argv[4]) + fns = find_global(files, attribute=sys.argv[1], regexp=sys.argv[2]) + + for f in fns: + print(f) \ No newline at end of file diff --git a/ocean_dp/plotting/density_plot.py b/ocean_dp/plotting/density_plot.py new file mode 100755 index 0000000..a798b02 --- /dev/null +++ b/ocean_dp/plotting/density_plot.py @@ -0,0 +1,25 @@ +# Copyright (C) 2020 Ben Weeding +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +import sys +import os + +sys.path.append('/Users/tru050/Documents/GitHub/imos-tools/ocean_dp/file_name') + +import find_file_with + +path = "/Users/Tru050/Desktop/cloudstor/Shared/SOTS-Temp-Raw-Data" + +sots_files = find_file_with.find_files_pattern(os.path.join(path, "IMOS*FV00*.nc")) \ No newline at end of file diff --git a/ocean_dp/plotting/panda_maker.py b/ocean_dp/plotting/panda_maker.py new file mode 100755 index 0000000..b21add7 --- /dev/null +++ b/ocean_dp/plotting/panda_maker.py @@ -0,0 +1,126 @@ +# Copyright (C) 2020 Ben Weeding +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +import numpy.ma as ma +import sys +from netCDF4 import Dataset, num2date +from dateutil import parser +import numpy as np +import argparse +import glob +import pytz +import os +import matplotlib.pyplot as plt +from matplotlib import colors +from matplotlib.ticker import PercentFormatter +from sigfig import round +import pandas as pd + +import warnings +import scipy.stats as st +import statsmodels as sm +import matplotlib + +# this function creates a pandas Datatable object, searching through all the +# netcdf files in the directory given, containing all the variables specified + +# "/Users/tru050/Desktop/cloudstor/Shared/SOTS-Temp-Raw-Data" + +# ["dTemp/dtime","dTemp/dSample","QC","Nominal depth","Deployment"] + +# qc selection!!! + +def panda_maker(dir_spec,var_list,qc_lim=2): + + # creates an empty array to store the names of the SOTS deployments + deployments = [] + + checked_files = [] + + processed_files = [] + + # loops through all the folders and files contained in the folder + for x in os.listdir(dir_spec): + + # if the folder/file name contains 'Pulse' or 'SOFS' and doesn't contain '.', append it to deployments + if (('Pulse' in x) or ('SOFS' in x)) and ('.p' not in x): + + deployments.append(x) + + + + # create a dataframe to store extract information + total_df = pd.DataFrame(columns = var_list) + + # add deployment code to the dataframe + total_df.insert(len(var_list),'Deployment code',[]) + + # loops through all files in the directory + for root, dirs, files in os.walk("/Users/tru050/Desktop/cloudstor/Shared/SOTS-Temp-Raw-Data"): + + for fname in files: + + # append the filename to the list of checked files + checked_files.append(fname) + + # for each netcdf file labelled as FV01 and containing a deployment in its name + if fname.endswith('.nc') and 'FV01' in fname and any(ele in fname for ele in deployments): + + # print the filename + print(fname) + + # open the file + nc = Dataset(os.path.join(root,fname), mode = 'r') + + # check file contains all the specified variables and the time format is correct + if (all(ele in list(nc.variables) for ele in var_list)) & (nc.variables['TIME'].getncattr('units') =='days since 1950-01-01 00:00:00 UTC'): + + # create a current dataframe for the netcdf file, to be appended to the overall dataframe + cur_df = pd.DataFrame(columns=var_list) + + # create a qc vector for the netcdf file + cur_qc = np.zeros(nc.variables["TIME"].shape) + + for cur_var in var_list: + + if np.array(nc.variables[cur_var]).size == 1: + + filling = np.ones(nc.variables["TIME"].shape) * np.array(nc.variables[cur_var]) + + else: + + filling = np.array(nc.variables[cur_var][:]) + + if cur_var + '_quality_control' in list(nc.variables): + + cur_qc = np.maximum(cur_qc,np.array(nc.variables[cur_var + '_quality_control'])) + + + cur_df[cur_var] = filling + + cur_df['Deployment code'] = [nc.deployment_code] * len(np.array(nc.variables['TIME'])) + + # append the current netcdf's dataframe to the sots_temp_ensemble + total_df = total_df.append(cur_df.iloc[np.where(cur_qc<=qc_lim)]) + + # append the filename to the list of processed files + processed_files.append(fname) + + + nc.close() + + + return total_df + \ No newline at end of file diff --git a/ocean_dp/processing/add_density.py b/ocean_dp/processing/add_density.py index 6e56a53..9320ddf 100755 --- a/ocean_dp/processing/add_density.py +++ b/ocean_dp/processing/add_density.py @@ -57,8 +57,10 @@ def add_density(netCDFfile): # calculates density density = gsw.rho(SA, CT, p) - + # generates a new variable 'DENSITY' in the netcdf ncVarOut = ds.createVariable("DENSITY", "f4", ("TIME",), fill_value=np.nan, zlib=True) # fill_value=nan otherwise defaults to max + + # assigns the calculated densities to the DENSITY variable, sets the units as kg/m^3, and comments on the variable's origin ncVarOut[:] = density ncVarOut.units = "kg/m^3" ncVarOut.comment = "calculated using gsw-python https://teos-10.github.io/GSW-Python/index.html" From 3d32c27082ae4fa98c06d7d7427a88aab4eb19b2 Mon Sep 17 00:00:00 2001 From: bweeding <57697604+bweeding@users.noreply.github.com> Date: Thu, 28 May 2020 16:19:49 +1000 Subject: [PATCH 50/59] Update panda_maker.py --- ocean_dp/plotting/panda_maker.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ocean_dp/plotting/panda_maker.py b/ocean_dp/plotting/panda_maker.py index b21add7..5c3aa70 100755 --- a/ocean_dp/plotting/panda_maker.py +++ b/ocean_dp/plotting/panda_maker.py @@ -121,6 +121,8 @@ def panda_maker(dir_spec,var_list,qc_lim=2): nc.close() + total_df = total_df.reset_index() + return total_df \ No newline at end of file From 11b897320e45f92d7a6b1638a931254b0864d879 Mon Sep 17 00:00:00 2001 From: bweeding <57697604+bweeding@users.noreply.github.com> Date: Thu, 28 May 2020 16:22:22 +1000 Subject: [PATCH 51/59] Update panda_maker.py --- ocean_dp/plotting/panda_maker.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocean_dp/plotting/panda_maker.py b/ocean_dp/plotting/panda_maker.py index 5c3aa70..2fc9bfb 100755 --- a/ocean_dp/plotting/panda_maker.py +++ b/ocean_dp/plotting/panda_maker.py @@ -121,7 +121,7 @@ def panda_maker(dir_spec,var_list,qc_lim=2): nc.close() - total_df = total_df.reset_index() + total_df = total_df.reset_index(drop=True) return total_df From d75303b06ab44354b656b30b1f760f6574092c27 Mon Sep 17 00:00:00 2001 From: bweeding <57697604+bweeding@users.noreply.github.com> Date: Fri, 29 May 2020 13:50:12 +1000 Subject: [PATCH 52/59] Create panda_merger.py --- ocean_dp/processing/panda_merger.py | 73 +++++++++++++++++++++++++++++ 1 file changed, 73 insertions(+) create mode 100755 ocean_dp/processing/panda_merger.py diff --git a/ocean_dp/processing/panda_merger.py b/ocean_dp/processing/panda_merger.py new file mode 100755 index 0000000..1425f5c --- /dev/null +++ b/ocean_dp/processing/panda_merger.py @@ -0,0 +1,73 @@ +# Copyright (C) 2020 Ben Weeding +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +import numpy.ma as ma +import sys +from netCDF4 import Dataset, num2date +from dateutil import parser +from datetime import datetime as dt +from datetime import timedelta +import numpy as np +import argparse +import glob +import pytz +import os +import matplotlib.pyplot as plt +from matplotlib import colors +from matplotlib.ticker import PercentFormatter +from sigfig import round +import pandas as pd + +nc1 = Dataset('IMOS_ABOS-SOTS_COPST_20180801_SOFS_FV00_SOFS-7.5-2018-SBE37SMP-ODO-RS232-03715971-200m_END-20190324_C-20200401.nc',mode='r') + +nc2 = Dataset('IMOS_ABOS-SOTS_T_20180801_SOFS_FV00_SOFS-7.5-2018-Starmon-mini-4048-45m_END-20190331_C-20200401.nc',mode='r') + +df1 = pd.DataFrame({'TIME':np.array(nc1.variables['TIME'][:]),'TEMP':np.array(nc1.variables['TEMP'][:])}) + +df1_time = np.array(nc1.variables['TIME'][:]) + +df2 = pd.DataFrame({'TIME':np.array(nc2.variables['TIME'][:]),'TEMP':np.array(nc2.variables['TEMP'][:])}) + +# convert datenums to datetimes + +pd.to_timedelta(df1['TIME'],unit='D')+dt(1950,1,1) + +df1['TIME']=pd.to_timedelta(df1['TIME'],unit='D')+dt(1950,1,1) + +df1a=df1.set_index('TIME') + +# resample hourly + +df1h=df1a.resample('H',base=0.5).mean() + +df1h.index = df1h.index + pd.Timedelta('30 min') + + + +# merge + + + + + + + + + + + + + + From 1c07d8cc57efd37a82324a47ed9ea9081fbc1a85 Mon Sep 17 00:00:00 2001 From: bweeding <57697604+bweeding@users.noreply.github.com> Date: Fri, 29 May 2020 14:27:05 +1000 Subject: [PATCH 53/59] Update panda_merger.py --- ocean_dp/processing/panda_merger.py | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/ocean_dp/processing/panda_merger.py b/ocean_dp/processing/panda_merger.py index 1425f5c..6f69822 100755 --- a/ocean_dp/processing/panda_merger.py +++ b/ocean_dp/processing/panda_merger.py @@ -30,33 +30,40 @@ from sigfig import round import pandas as pd +# import two netcdf nc1 = Dataset('IMOS_ABOS-SOTS_COPST_20180801_SOFS_FV00_SOFS-7.5-2018-SBE37SMP-ODO-RS232-03715971-200m_END-20190324_C-20200401.nc',mode='r') nc2 = Dataset('IMOS_ABOS-SOTS_T_20180801_SOFS_FV00_SOFS-7.5-2018-Starmon-mini-4048-45m_END-20190331_C-20200401.nc',mode='r') -df1 = pd.DataFrame({'TIME':np.array(nc1.variables['TIME'][:]),'TEMP':np.array(nc1.variables['TEMP'][:])}) +# convert their time and temp data into dataframes +df1 = pd.DataFrame({'TIME':np.array(nc1.variables['TIME'][:]),'TEMP_200':np.array(nc1.variables['TEMP'][:])}) -df1_time = np.array(nc1.variables['TIME'][:]) +df2 = pd.DataFrame({'TIME':np.array(nc2.variables['TIME'][:]),'TEMP_45':np.array(nc2.variables['TEMP'][:])}) -df2 = pd.DataFrame({'TIME':np.array(nc2.variables['TIME'][:]),'TEMP':np.array(nc2.variables['TEMP'][:])}) +# convert the times from days since 01-01-1950 to a datetime object +df1['TIME']=pd.to_timedelta(df1['TIME'],unit='D')+dt(1950,1,1) -# convert datenums to datetimes +df2['TIME']=pd.to_timedelta(df2['TIME'],unit='D')+dt(1950,1,1) -pd.to_timedelta(df1['TIME'],unit='D')+dt(1950,1,1) +# index the dataframes by time +df1=df1.set_index('TIME') -df1['TIME']=pd.to_timedelta(df1['TIME'],unit='D')+dt(1950,1,1) +df2=df2.set_index('TIME') -df1a=df1.set_index('TIME') -# resample hourly +# resample the data, calculating the mean over hourly periods, starting on the half hour +df1=df1.resample('H',base=0.5).mean() -df1h=df1a.resample('H',base=0.5).mean() +df2=df2.resample('H',base=0.5).mean() -df1h.index = df1h.index + pd.Timedelta('30 min') +# reset the labels so they read the hour in the centre of the averaging period +df1.index = df1.index + pd.Timedelta('30 min') +df2.index = df2.index + pd.Timedelta('30 min') -# merge +# combine the two dataframes based on their time indicies, recording nan if one sensor doesn't have a reading for that timestamp +total_df = pd.concat([df1,df2], join='outer', axis=1) From 19a80341ed3d41e60d18b33255c1c47e758a2acc Mon Sep 17 00:00:00 2001 From: bweeding <57697604+bweeding@users.noreply.github.com> Date: Fri, 29 May 2020 14:28:23 +1000 Subject: [PATCH 54/59] Update panda_merger.py --- ocean_dp/processing/panda_merger.py | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/ocean_dp/processing/panda_merger.py b/ocean_dp/processing/panda_merger.py index 6f69822..69de3f6 100755 --- a/ocean_dp/processing/panda_merger.py +++ b/ocean_dp/processing/panda_merger.py @@ -20,14 +20,7 @@ from datetime import datetime as dt from datetime import timedelta import numpy as np -import argparse -import glob -import pytz -import os -import matplotlib.pyplot as plt -from matplotlib import colors -from matplotlib.ticker import PercentFormatter -from sigfig import round + import pandas as pd # import two netcdf From fe99b4eebf283ba9faa2c1f3cab31be1d4724147 Mon Sep 17 00:00:00 2001 From: bweeding <57697604+bweeding@users.noreply.github.com> Date: Fri, 29 May 2020 15:46:23 +1000 Subject: [PATCH 55/59] Update panda_merger.py --- ocean_dp/processing/panda_merger.py | 38 ++++++++++++++++++++++++++--- 1 file changed, 34 insertions(+), 4 deletions(-) diff --git a/ocean_dp/processing/panda_merger.py b/ocean_dp/processing/panda_merger.py index 69de3f6..495b906 100755 --- a/ocean_dp/processing/panda_merger.py +++ b/ocean_dp/processing/panda_merger.py @@ -20,7 +20,7 @@ from datetime import datetime as dt from datetime import timedelta import numpy as np - +import glob import pandas as pd # import two netcdf @@ -29,9 +29,9 @@ nc2 = Dataset('IMOS_ABOS-SOTS_T_20180801_SOFS_FV00_SOFS-7.5-2018-Starmon-mini-4048-45m_END-20190331_C-20200401.nc',mode='r') # convert their time and temp data into dataframes -df1 = pd.DataFrame({'TIME':np.array(nc1.variables['TIME'][:]),'TEMP_200':np.array(nc1.variables['TEMP'][:])}) +df1 = pd.DataFrame({'TIME':np.array(nc1.variables['TIME'][:]),'TEMP_'+str(nc1.variables['NOMINAL_DEPTH'][0]):np.array(nc1.variables['TEMP'][:])}) -df2 = pd.DataFrame({'TIME':np.array(nc2.variables['TIME'][:]),'TEMP_45':np.array(nc2.variables['TEMP'][:])}) +df2 = pd.DataFrame({'TIME':np.array(nc2.variables['TIME'][:]),'TEMP_'+str(nc2.variables['NOMINAL_DEPTH'][0]):np.array(nc2.variables['TEMP'][:])}) # convert the times from days since 01-01-1950 to a datetime object df1['TIME']=pd.to_timedelta(df1['TIME'],unit='D')+dt(1950,1,1) @@ -59,7 +59,37 @@ total_df = pd.concat([df1,df2], join='outer', axis=1) - +files = glob.glob('*FV00*.nc') + +var_name = 'TEMP' + +def panda_combine(files,var_name): + + total_df = pd.DataFrame({'A' : []}) + + # make a sorting index for columns from nominal depths + + for cur_file in files: + + cur_nc = Dataset(cur_file,mode='r') + + cur_df = pd.DataFrame({'TIME':np.array(cur_nc.variables['TIME'][:]),var_name+'_'+str(cur_nc.variables['NOMINAL_DEPTH'][0]):np.array(cur_nc.variables[var_name][:])}) + + cur_df['TIME']=pd.to_timedelta(cur_df['TIME'],unit='D')+dt(1950,1,1) + + cur_df = cur_df.set_index('TIME') + + cur_df = cur_df.resample('H',base=0.5).mean() + + cur_df.index = cur_df.index + pd.Timedelta('30 min') + + total_df = pd.concat([total_df,cur_df], join='outer', axis=1) + + print(cur_file) + + print(len(total_df)) + + return total_df From afeb0670d6a2b8c809fb2c01f54fd96d90b14971 Mon Sep 17 00:00:00 2001 From: bweeding <57697604+bweeding@users.noreply.github.com> Date: Mon, 1 Jun 2020 15:28:35 +1000 Subject: [PATCH 56/59] beginning of netcdf to panda converter --- ocean_dp/processing/netcdf_to_df.py | 94 +++++++++++++++++++++++++++++ ocean_dp/processing/panda_merger.py | 93 ++++++++++++++++++---------- 2 files changed, 155 insertions(+), 32 deletions(-) create mode 100755 ocean_dp/processing/netcdf_to_df.py diff --git a/ocean_dp/processing/netcdf_to_df.py b/ocean_dp/processing/netcdf_to_df.py new file mode 100755 index 0000000..dc1d6fa --- /dev/null +++ b/ocean_dp/processing/netcdf_to_df.py @@ -0,0 +1,94 @@ +# Copyright (C) 2020 Ben Weeding +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +from netCDF4 import Dataset, num2date +import pandas as pd +import numpy as np +from datetime import datetime as dt + + +# ============================================================================= +# Returns a list of the time series variables in a IMOS format +# netcdf file. Takes an open netcdf as its argument. +# ============================================================================= +def var_selector_inc_time(nc,qc=False,): + + x = [x for x in list(nc.variables) if ('_quality_control_' not in x) & (nc.variables[x].shape!=())] + + return x + +# ============================================================================= +# +# ============================================================================= +def netcdf_to_df(target_file): + + # open the inputted netcdf + nc = Dataset(target_file,mode='r') + + # creates a the list of variables to transfer to the dataframe + vars_to_transfer = var_selector_inc_time(nc) + + # creates the dataframe with column labels + df = pd.DataFrame(columns = vars_to_transfer) + + # sorts the columns alphabetically, with the relevant qc variable following each timeseries variable + df.sort_index(axis=1, inplace=True) + + # fill the dataframe from the netcdf, variable by variable + for cur_var in vars_to_transfer: + + df[cur_var] = np.array(nc.variables[cur_var]) + + # convert time into a datetime object, this is optional, and not needed to continue in the IMOS format + #df['TIME']=pd.to_timedelta(df['TIME'],unit='D')+dt(1950,1,1) + + # index the dataframe by time + df = df.set_index('TIME') + + # extract the column names + col_names = list(df.columns) + + # append the nominal depth to all column names + df.columns = [x.replace('quality_control',str(nc.variables['NOMINAL_DEPTH'][0])+'_quality_control') if 'quality_control' in x else x + '_' + str(nc.variables['NOMINAL_DEPTH'][0]) for x in col_names] + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/ocean_dp/processing/panda_merger.py b/ocean_dp/processing/panda_merger.py index 495b906..13a789c 100755 --- a/ocean_dp/processing/panda_merger.py +++ b/ocean_dp/processing/panda_merger.py @@ -22,50 +22,34 @@ import numpy as np import glob import pandas as pd +import re -# import two netcdf -nc1 = Dataset('IMOS_ABOS-SOTS_COPST_20180801_SOFS_FV00_SOFS-7.5-2018-SBE37SMP-ODO-RS232-03715971-200m_END-20190324_C-20200401.nc',mode='r') -nc2 = Dataset('IMOS_ABOS-SOTS_T_20180801_SOFS_FV00_SOFS-7.5-2018-Starmon-mini-4048-45m_END-20190331_C-20200401.nc',mode='r') -# convert their time and temp data into dataframes -df1 = pd.DataFrame({'TIME':np.array(nc1.variables['TIME'][:]),'TEMP_'+str(nc1.variables['NOMINAL_DEPTH'][0]):np.array(nc1.variables['TEMP'][:])}) -df2 = pd.DataFrame({'TIME':np.array(nc2.variables['TIME'][:]),'TEMP_'+str(nc2.variables['NOMINAL_DEPTH'][0]):np.array(nc2.variables['TEMP'][:])}) - -# convert the times from days since 01-01-1950 to a datetime object -df1['TIME']=pd.to_timedelta(df1['TIME'],unit='D')+dt(1950,1,1) - -df2['TIME']=pd.to_timedelta(df2['TIME'],unit='D')+dt(1950,1,1) - -# index the dataframes by time -df1=df1.set_index('TIME') - -df2=df2.set_index('TIME') - - -# resample the data, calculating the mean over hourly periods, starting on the half hour -df1=df1.resample('H',base=0.5).mean() - -df2=df2.resample('H',base=0.5).mean() - -# reset the labels so they read the hour in the centre of the averaging period -df1.index = df1.index + pd.Timedelta('30 min') - -df2.index = df2.index + pd.Timedelta('30 min') +files = glob.glob('*FV00*.nc') +var_name = 'TEMP' -# combine the two dataframes based on their time indicies, recording nan if one sensor doesn't have a reading for that timestamp -total_df = pd.concat([df1,df2], join='outer', axis=1) +def depth_from_file(file_in): + + result = int(re.findall(r'(?<=-)\w+(?=m_END)', file_in)[0]) + + return result +def var_selector(file): + + [x for x in var_list if (x!='TIME') & ('_quality_control' not in x) & (nc.variables[x].shape!=())] + + return x -files = glob.glob('*FV00*.nc') -var_name = 'TEMP' def panda_combine(files,var_name): - total_df = pd.DataFrame({'A' : []}) + files.sort(key=depth_from_file) + + total_df = pd.DataFrame({'dummy' : []}) # make a sorting index for columns from nominal depths @@ -88,12 +72,57 @@ def panda_combine(files,var_name): print(cur_file) print(len(total_df)) + + total_df.drop(['dummy'],axis=1,inplace=True) return total_df +result = re.findall(r'(?<=-)\w+(?=m_END)', cur_file) + + + + + + +# ============================================================================= +# Old proof of concept code +# ============================================================================= + +# # import two netcdf +# nc1 = Dataset('IMOS_ABOS-SOTS_COPST_20180801_SOFS_FV00_SOFS-7.5-2018-SBE37SMP-ODO-RS232-03715971-200m_END-20190324_C-20200401.nc',mode='r') + +# nc2 = Dataset('IMOS_ABOS-SOTS_T_20180801_SOFS_FV00_SOFS-7.5-2018-Starmon-mini-4048-45m_END-20190331_C-20200401.nc',mode='r') + +# # convert their time and temp data into dataframes +# df1 = pd.DataFrame({'TIME':np.array(nc1.variables['TIME'][:]),'TEMP_'+str(nc1.variables['NOMINAL_DEPTH'][0]):np.array(nc1.variables['TEMP'][:])}) + +# df2 = pd.DataFrame({'TIME':np.array(nc2.variables['TIME'][:]),'TEMP_'+str(nc2.variables['NOMINAL_DEPTH'][0]):np.array(nc2.variables['TEMP'][:])}) + +# # convert the times from days since 01-01-1950 to a datetime object +# df1['TIME']=pd.to_timedelta(df1['TIME'],unit='D')+dt(1950,1,1) + +# df2['TIME']=pd.to_timedelta(df2['TIME'],unit='D')+dt(1950,1,1) + +# # index the dataframes by time +# df1=df1.set_index('TIME') + +# df2=df2.set_index('TIME') + + +# # resample the data, calculating the mean over hourly periods, starting on the half hour +# df1=df1.resample('H',base=0.5).mean() + +# df2=df2.resample('H',base=0.5).mean() + +# # reset the labels so they read the hour in the centre of the averaging period +# df1.index = df1.index + pd.Timedelta('30 min') + +# df2.index = df2.index + pd.Timedelta('30 min') +# # combine the two dataframes based on their time indicies, recording nan if one sensor doesn't have a reading for that timestamp +# total_df = pd.concat([df1,df2], join='outer', axis=1) From b43195f3ff55166fec8847ce5980cfeb035011a8 Mon Sep 17 00:00:00 2001 From: bweeding <57697604+bweeding@users.noreply.github.com> Date: Tue, 2 Jun 2020 16:13:46 +1000 Subject: [PATCH 57/59] netcdf_to_df progress --- ocean_dp/processing/netcdf_to_df.py | 87 +++++++++++++++++-- ocean_dp/qc/in_out_water.py | 127 +++++++++++++++++----------- 2 files changed, 158 insertions(+), 56 deletions(-) diff --git a/ocean_dp/processing/netcdf_to_df.py b/ocean_dp/processing/netcdf_to_df.py index dc1d6fa..e86b158 100755 --- a/ocean_dp/processing/netcdf_to_df.py +++ b/ocean_dp/processing/netcdf_to_df.py @@ -49,22 +49,93 @@ def netcdf_to_df(target_file): # fill the dataframe from the netcdf, variable by variable for cur_var in vars_to_transfer: - df[cur_var] = np.array(nc.variables[cur_var]) - - # convert time into a datetime object, this is optional, and not needed to continue in the IMOS format - #df['TIME']=pd.to_timedelta(df['TIME'],unit='D')+dt(1950,1,1) + df[cur_var] = np.array(nc.variables[cur_var]) - # index the dataframe by time - df = df.set_index('TIME') + # store deployment times in attributes + df.attrs['time_deployment_start'] = nc.time_deployment_start + df.attrs['time_deployment_end'] = nc.time_deployment_end # extract the column names col_names = list(df.columns) # append the nominal depth to all column names - df.columns = [x.replace('quality_control',str(nc.variables['NOMINAL_DEPTH'][0])+'_quality_control') if 'quality_control' in x else x + '_' + str(nc.variables['NOMINAL_DEPTH'][0]) for x in col_names] - + df.columns = [x.replace('quality_control',str(nc.variables['NOMINAL_DEPTH'][0])+'_quality_control') if 'quality_control' in x else x if 'TIME' in x else x + '_' + str(nc.variables['NOMINAL_DEPTH'][0]) for x in col_names] + + nc.close() + + return df + + +# ============================================================================= +# Takes +# ============================================================================= + +def combine_df(target_dfs): + + # for each of the dataframes in the list provided + for cur_df in target_dfs: + + # make a copy of the current dataframe to modify and combine + df = cur_df.copy() + + # convert the IMOS format times to datetime + df['TIME']=pd.to_timedelta(df['TIME'],unit='D')+dt(1950,1,1) + + # index the dataframe by time - for some reason this makes the df very slow to visually open and navigate!? + df = df.set_index('TIME') + + # extract and convert deployment times to datetime + start_time = dt.strptime(df.attrs['time_deployment_start'],'%Y-%m-%dT%H:%M:%SZ') + end_time = dt.strptime(df.attrs['time_deployment_end'],'%Y-%m-%dT%H:%M:%SZ') + + # trim the df to only include in water data + df = df.drop(df[(df.index < start_time) | (df.index > end_time)].index) + + # resamples using the max method, to create a df of the correct dimensions to fill + df_to_fill = df.resample('H',base=0.5).max() + + + # gets list of column names + col_names = list(df.columns) + # makes a list of non qc column names + col_names_no_qc = [x for x in col_names if 'quality_control' not in x] + # for each of the time series data columns + for cur_col in col_names_no_qc: + + # sets the value of non qc data to nan if the corresponding qc value is not satisfactory (0,1,2,7 at the moment) + df.loc[(df[cur_col+'_quality_control'] > 2) & (df[cur_col+'_quality_control'] != 7), cur_col] = np.nan + + # extracts the time series data + dS = pd.Series(df[cur_col]) + + # makes a copy for bin counting + dS_1s = dS.copy() + + dS_1s[:] = 1 + + # resamples the series, interpoling linearly + dS_resampled = dS.resample('H',base=0.5).interpolate() + + # count how many data points are in each shoulder bin + dS_bin_counts = dS_1s.resample('H',base=0.5).sum() + + # fill the interpolated data back into the dataframe + df_to_fill[cur_col] = dS_resampled + + # give any interpolated point without any data within its hour window a qc code of 7 + df_to_fill.loc[dS_bin_counts==0,[cur_col+'_quality_control']] = 7 + + # shift the timestamps to the middle of the hour sampling period + df_to_fill.index = df_to_fill.index + pd.Timedelta('30 min') + + + + + + + diff --git a/ocean_dp/qc/in_out_water.py b/ocean_dp/qc/in_out_water.py index e8635ed..39da3a7 100644 --- a/ocean_dp/qc/in_out_water.py +++ b/ocean_dp/qc/in_out_water.py @@ -18,73 +18,104 @@ from netCDF4 import Dataset, num2date import sys - +from datetime import datetime import numpy as np from dateutil import parser import pytz import os -# flag out of water as QC value 7 (not_deployed), with wise leave as 0 +# flag out of water as QC value 6 (not_deployed), with wise leave as 0 def in_out_water(netCDFfile, var_name=None): - ds = Dataset(netCDFfile, 'a') - - nc_vars = ds.variables - to_add = [] - if var_name: - to_add.append(var_name) - else: - for v in nc_vars: - #print (vars[v].dimensions) - if v != 'TIME': - to_add.append(v) - - time_var = nc_vars["TIME"] - time = num2date(time_var[:], units=time_var.units, calendar=time_var.calendar) - - time_deploy = parser.parse(ds.time_deployment_start, ignoretz=True) - time_recovery = parser.parse(ds.time_deployment_end, ignoretz=True) - print('deployment time', time_deploy) + out_file = [] + + for fn in netCDFfile: + ds = Dataset(fn, 'a') + + nc_vars = ds.variables + to_add = [] + if var_name: + to_add.append(var_name) + else: + for v in nc_vars: + if "TIME" in nc_vars[v].dimensions: + #print (vars[v].dimensions) + if v != 'TIME': + to_add.append(v) + # remove any anx variables from the list + for v in nc_vars: + if 'ancillary_variables' in nc_vars[v].ncattrs(): + remove = nc_vars[v].getncattr('ancillary_variables').split(' ') + print("remove ", remove) + for r in remove: + to_add.remove(r) + + time_var = nc_vars["TIME"] + time = num2date(time_var[:], units=time_var.units, calendar=time_var.calendar) + + time_deploy = parser.parse(ds.time_deployment_start, ignoretz=True) + time_recovery = parser.parse(ds.time_deployment_end, ignoretz=True) + + print('file', fn) + print('deployment time', time_deploy) + + print('var to add', to_add) + + # create a mask for the time range + mask = (time <= time_deploy) | (time >= time_recovery) + + for v in to_add: + print("var", v, ' dimensions ', nc_vars[v].dimensions) + + ncVarOut = nc_vars[v + "_quality_control"] + ncVarOut[mask] = 6 + + # create a qc variable just for this test flags + if v + "_quality_control_io" in ds.variables: + ncVarOut = ds.variables[v + "_quality_control_io"] + ncVarOut[:] = 0 + else: + ncVarOut = ds.createVariable(v + "_quality_control_io", "i1", nc_vars[v].dimensions, fill_value=99, zlib=True) # fill_value=0 otherwise defaults to max + nc_vars[v].ancillary_variables = nc_vars[v].ancillary_variables + " " + v + "_quality_control_io" - print(to_add) + ncVarOut[:] = 0 + ncVarOut.long_name = "quality flag for " + nc_vars[v].long_name + try: + ncVarOut.standard_name = nc_vars[v].standard_name + " status_flag" + except AttributeError: + pass - # create a mask for the time range - mask = (time <= time_deploy) | (time >= time_recovery) + ncVarOut.quality_control_conventions = "IMOS standard flags" + ncVarOut.flag_values = np.array([0, 1, 2, 3, 4, 6, 7, 9], dtype=np.int8) + ncVarOut.flag_meanings = 'unknown good_data probably_good_data probably_bad_data bad_data not_deployed interpolated missing_value' + ncVarOut.comment = 'data flagged not deployed (6) when out of water' - for v in to_add: - if "TIME" in nc_vars[v].dimensions: - if v.endswith("_quality_control"): - print("QC time dim ", v) + ncVarOut[mask] = 6 + # calculate the number of points marked as bad_data + marked = np.zeros_like(ncVarOut) + marked[mask] = 1 + count = sum(marked) - ncVarOut = nc_vars[v] - ncVarOut[mask] = 7 - else: - # create a qc variable just for this test flags - if v + "_quality_control_io" in ds.variables: - ncVarOut = ds.variables[v + "_quality_control_io"] - else: - ncVarOut = ds.createVariable(v + "_quality_control_io", "i1", nc_vars[v].dimensions, fill_value=99, zlib=True) # fill_value=0 otherwise defaults to max - ncVarOut[:] = np.zeros(nc_vars[v].shape) - ncVarOut.long_name = "quality flag for " + v - ncVarOut.flag_values = np.array([0, 1, 2, 3, 4, 6, 7, 9], dtype=np.int8) - ncVarOut.flag_meanings = 'unknown good_data probably_good_data probably_bad_data bad_data not_deployed interpolated missing_value' + ds.file_version = "Level 1 - Quality Controlled Data" + # update the history attribute + try: + hist = ds.history + "\n" + except AttributeError: + hist = "" - nc_vars[v].ancillary_variables = nc_vars[v].ancillary_variables + " " + v + "_quality_control_io" - ncVarOut[mask] = 7 - - ds.variables[v + "_quality_control"][:] = np.maximum(ds.variables[v + "_quality_control_io"][:],ds.variables[v + "_quality_control"][:]) + ds.setncattr('history', hist + datetime.utcnow().strftime("%Y-%m-%d") + ' : ' + ' marked ' + str(int(count))) - ds.file_version = "Level 1 - Quality Controlled Data" + ds.close() - ds.close() + out_file.append(fn) - return netCDFfile + return out_file if __name__ == "__main__": if len(sys.argv) > 2 & sys.argv[1].startswith('-'): - in_out_water(sys.argv[2], var_name=sys.argv[1][1:]) + in_out_water(sys.argv[2:], var_name=sys.argv[1][1:]) else: - in_out_water(sys.argv[1]) \ No newline at end of file + in_out_water(sys.argv[1:]) \ No newline at end of file From fe0ab111fbe2b157127cfada96a2704db54dc9ff Mon Sep 17 00:00:00 2001 From: bweeding <57697604+bweeding@users.noreply.github.com> Date: Thu, 11 Jun 2020 16:22:27 +1000 Subject: [PATCH 58/59] Various updates after convo with TT, not complete --- ocean_dp/processing/netcdf_to_df.py | 54 +++++++++- ocean_dp/qc/in_out_water.py | 144 ++++++++++++------------- ocean_dp/qc/qc_checker.py | 7 ++ ocean_dp/sots_processing_runthrough.py | 5 + 4 files changed, 134 insertions(+), 76 deletions(-) diff --git a/ocean_dp/processing/netcdf_to_df.py b/ocean_dp/processing/netcdf_to_df.py index e86b158..4a409a4 100755 --- a/ocean_dp/processing/netcdf_to_df.py +++ b/ocean_dp/processing/netcdf_to_df.py @@ -17,7 +17,8 @@ import pandas as pd import numpy as np from datetime import datetime as dt - +import glob +import re # ============================================================================= # Returns a list of the time series variables in a IMOS format @@ -54,6 +55,7 @@ def netcdf_to_df(target_file): # store deployment times in attributes df.attrs['time_deployment_start'] = nc.time_deployment_start df.attrs['time_deployment_end'] = nc.time_deployment_end + df.attrs['nominal_depth'] = nc.variables['NOMINAL_DEPTH'][0] # extract the column names col_names = list(df.columns) @@ -72,6 +74,8 @@ def netcdf_to_df(target_file): def combine_df(target_dfs): + total_df = pd.DataFrame({'dummy' : []}) + # for each of the dataframes in the list provided for cur_df in target_dfs: @@ -91,8 +95,11 @@ def combine_df(target_dfs): # trim the df to only include in water data df = df.drop(df[(df.index < start_time) | (df.index > end_time)].index) + # remove data with bad qc instead of setting to nan later in process, let resample do the work? + # but what if psal is bad but temp is good?? Need to think on this. + # resamples using the max method, to create a df of the correct dimensions to fill - df_to_fill = df.resample('H',base=0.5).max() + df_to_fill = df.resample('H',base=0.5).min() # gets list of column names @@ -105,7 +112,9 @@ def combine_df(target_dfs): for cur_col in col_names_no_qc: # sets the value of non qc data to nan if the corresponding qc value is not satisfactory (0,1,2,7 at the moment) - df.loc[(df[cur_col+'_quality_control'] > 2) & (df[cur_col+'_quality_control'] != 7), cur_col] = np.nan + #df.loc[(df[cur_col+'_quality_control'] > 2) & (df[cur_col+'_quality_control'] != 7), cur_col] = np.nan + # CAUSING UNEXPECTED NANS - FIX + df.loc[df[cur_col+'_quality_control'].isin([3,4,6,9]) , cur_col] = np.nan # extracts the time series data dS = pd.Series(df[cur_col]) @@ -116,7 +125,8 @@ def combine_df(target_dfs): dS_1s[:] = 1 # resamples the series, interpoling linearly - dS_resampled = dS.resample('H',base=0.5).interpolate() + dS_resampled = dS.resample('H',base=0.5).interpolate(method='index',axis=0,limit=1000000) + # count how many data points are in each shoulder bin dS_bin_counts = dS_1s.resample('H',base=0.5).sum() @@ -130,8 +140,44 @@ def combine_df(target_dfs): # shift the timestamps to the middle of the hour sampling period df_to_fill.index = df_to_fill.index + pd.Timedelta('30 min') + total_df = pd.concat([total_df,df_to_fill], join='outer', axis=1) + + print(cur_df) + + print(len(total_df)) + + total_df.drop(['dummy'],axis=1,inplace=True) + + return total_df + +# ============================================================================= +# +# ============================================================================= +def depth_from_file(file_in): + + result = int(re.findall(r'(?<=-)\w+(?=m_END)', file_in)[0]) + + return result + +# ============================================================================= +# +# ============================================================================= + +netcdfs = glob.glob('*FV01*.nc') + +netcdfs = sorted(netcdfs,key=depth_from_file) + +df_list = list() + +for cur_netcdf in netcdfs: + + df = netcdf_to_df(cur_netcdf) + + df_list.append(df) + + diff --git a/ocean_dp/qc/in_out_water.py b/ocean_dp/qc/in_out_water.py index 39da3a7..7ec3e4d 100644 --- a/ocean_dp/qc/in_out_water.py +++ b/ocean_dp/qc/in_out_water.py @@ -27,89 +27,89 @@ # flag out of water as QC value 6 (not_deployed), with wise leave as 0 -def in_out_water(netCDFfile, var_name=None): +def in_out_water(fn, var_name=None): out_file = [] - for fn in netCDFfile: - ds = Dataset(fn, 'a') - nc_vars = ds.variables - to_add = [] - if var_name: - to_add.append(var_name) - else: - for v in nc_vars: - if "TIME" in nc_vars[v].dimensions: - #print (vars[v].dimensions) - if v != 'TIME': - to_add.append(v) - # remove any anx variables from the list - for v in nc_vars: - if 'ancillary_variables' in nc_vars[v].ncattrs(): - remove = nc_vars[v].getncattr('ancillary_variables').split(' ') - print("remove ", remove) - for r in remove: - to_add.remove(r) - - time_var = nc_vars["TIME"] - time = num2date(time_var[:], units=time_var.units, calendar=time_var.calendar) - - time_deploy = parser.parse(ds.time_deployment_start, ignoretz=True) - time_recovery = parser.parse(ds.time_deployment_end, ignoretz=True) - - print('file', fn) - print('deployment time', time_deploy) - - print('var to add', to_add) - - # create a mask for the time range - mask = (time <= time_deploy) | (time >= time_recovery) - - for v in to_add: - print("var", v, ' dimensions ', nc_vars[v].dimensions) - - ncVarOut = nc_vars[v + "_quality_control"] - ncVarOut[mask] = 6 - - # create a qc variable just for this test flags - if v + "_quality_control_io" in ds.variables: - ncVarOut = ds.variables[v + "_quality_control_io"] - ncVarOut[:] = 0 - else: - ncVarOut = ds.createVariable(v + "_quality_control_io", "i1", nc_vars[v].dimensions, fill_value=99, zlib=True) # fill_value=0 otherwise defaults to max - nc_vars[v].ancillary_variables = nc_vars[v].ancillary_variables + " " + v + "_quality_control_io" + ds = Dataset(fn, 'a') + nc_vars = ds.variables + to_add = [] + if var_name: + to_add.append(var_name) + else: + for v in nc_vars: + if "TIME" in nc_vars[v].dimensions: + #print (vars[v].dimensions) + if v != 'TIME': + to_add.append(v) + # remove any anx variables from the list + for v in nc_vars: + if 'ancillary_variables' in nc_vars[v].ncattrs(): + remove = nc_vars[v].getncattr('ancillary_variables').split(' ') + print("remove ", remove) + for r in remove: + to_add.remove(r) + + time_var = nc_vars["TIME"] + time = num2date(time_var[:], units=time_var.units, calendar=time_var.calendar) + + time_deploy = parser.parse(ds.time_deployment_start, ignoretz=True) + time_recovery = parser.parse(ds.time_deployment_end, ignoretz=True) + + print('file', fn) + print('deployment time', time_deploy) + + print('var to add', to_add) + + # create a mask for the time range + mask = (time <= time_deploy) | (time >= time_recovery) + + for v in to_add: + print("var", v, ' dimensions ', nc_vars[v].dimensions) + + ncVarOut = nc_vars[v + "_quality_control"] + ncVarOut[mask] = 6 + + # create a qc variable just for this test flags + if v + "_quality_control_io" in ds.variables: + ncVarOut = ds.variables[v + "_quality_control_io"] ncVarOut[:] = 0 - ncVarOut.long_name = "quality flag for " + nc_vars[v].long_name - try: - ncVarOut.standard_name = nc_vars[v].standard_name + " status_flag" - except AttributeError: - pass - - ncVarOut.quality_control_conventions = "IMOS standard flags" - ncVarOut.flag_values = np.array([0, 1, 2, 3, 4, 6, 7, 9], dtype=np.int8) - ncVarOut.flag_meanings = 'unknown good_data probably_good_data probably_bad_data bad_data not_deployed interpolated missing_value' - ncVarOut.comment = 'data flagged not deployed (6) when out of water' - - ncVarOut[mask] = 6 - # calculate the number of points marked as bad_data - marked = np.zeros_like(ncVarOut) - marked[mask] = 1 - count = sum(marked) - - ds.file_version = "Level 1 - Quality Controlled Data" - # update the history attribute + else: + ncVarOut = ds.createVariable(v + "_quality_control_io", "i1", nc_vars[v].dimensions, fill_value=99, zlib=True) # fill_value=0 otherwise defaults to max + nc_vars[v].ancillary_variables = nc_vars[v].ancillary_variables + " " + v + "_quality_control_io" + + ncVarOut[:] = 0 + ncVarOut.long_name = "quality flag for " + nc_vars[v].long_name try: - hist = ds.history + "\n" + ncVarOut.standard_name = nc_vars[v].standard_name + " status_flag" except AttributeError: - hist = "" + pass + + ncVarOut.quality_control_conventions = "IMOS standard flags" + ncVarOut.flag_values = np.array([0, 1, 2, 3, 4, 6, 7, 9], dtype=np.int8) + ncVarOut.flag_meanings = 'unknown good_data probably_good_data probably_bad_data bad_data not_deployed interpolated missing_value' + ncVarOut.comment = 'data flagged not deployed (6) when out of water' + + ncVarOut[mask] = 6 + # calculate the number of points marked as bad_data + marked = np.zeros_like(ncVarOut) + marked[mask] = 1 + count = sum(marked) + + ds.file_version = "Level 1 - Quality Controlled Data" + # update the history attribute + try: + hist = ds.history + "\n" + except AttributeError: + hist = "" - ds.setncattr('history', hist + datetime.utcnow().strftime("%Y-%m-%d") + ' : ' + ' marked ' + str(int(count))) + ds.setncattr('history', hist + datetime.utcnow().strftime("%Y-%m-%d") + ' : ' + ' marked ' + str(int(count))) - ds.close() + ds.close() - out_file.append(fn) + out_file.append(fn) return out_file diff --git a/ocean_dp/qc/qc_checker.py b/ocean_dp/qc/qc_checker.py index 90e83e8..6fd7881 100755 --- a/ocean_dp/qc/qc_checker.py +++ b/ocean_dp/qc/qc_checker.py @@ -106,6 +106,13 @@ def qc_checker(nc,target_vars_in=[]): # The qc process has succeeded qc_behaving = True + + # sets all data with a qc value of 0 to have a qc value of 1, having passed all the tests + nc.variables[current_var+"_quality_control"][qc_global_data==0] = 1 + + now=datetime.utcnow() + + nc.history += ' ' + now.strftime("%Y%m%d:") + 'passed qc_checker, all qc=0 set to qc=1' # Returns true if qc has succeeded, false if not return qc_behaving diff --git a/ocean_dp/sots_processing_runthrough.py b/ocean_dp/sots_processing_runthrough.py index c389d27..f9aee09 100755 --- a/ocean_dp/sots_processing_runthrough.py +++ b/ocean_dp/sots_processing_runthrough.py @@ -62,6 +62,11 @@ # Run pressure_interpolator.py fv01_pres_interp_files = pressure_interpolator.pressure_interpolator(netCDFfiles=fv01_files,agg=glob.glob('*IMOS_ABOS-SOTS*Aggregate*.nc')[0]) +# delete the defunct FV01 files +for ncfile in fv01_files: + + os.remove(ncfile) + # Global range test for ncfile in fv01_pres_interp_files: From 0ed9243197a720947016359c99d967f0a9b88d13 Mon Sep 17 00:00:00 2001 From: bweeding <57697604+bweeding@users.noreply.github.com> Date: Thu, 25 Jun 2020 14:24:05 +1000 Subject: [PATCH 59/59] Create add_mld.py --- ocean_dp/processing/add_mld.py | 101 +++++++++++++++++++++++++++++++++ 1 file changed, 101 insertions(+) create mode 100755 ocean_dp/processing/add_mld.py diff --git a/ocean_dp/processing/add_mld.py b/ocean_dp/processing/add_mld.py new file mode 100755 index 0000000..112ce35 --- /dev/null +++ b/ocean_dp/processing/add_mld.py @@ -0,0 +1,101 @@ +# Copyright (C) 2020 Ben Weeding +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +from netCDF4 import Dataset, num2date +import sys +from datetime import datetime as dt +import numpy as np +import pandas as pd +from scipy import interpolate + +def add_mld(nc_in,thresh_in=0.2): + + # opens the supplied IMOS netcdf + nc = Dataset(nc_in,'a') + + temp_na = np.array(nc.variables['TEMP']) + + # create two nan filled arrays the length of the FV02 file, one for the mld and one for its uncertainty + nc_mld = np.full([1,temp_na.shape[1]], np.nan)[0] + + nc_mld_uncert = np.full([1,temp_na.shape[1]], np.nan)[0] + + # temp sensor depths + nc_temp_depths = np.array(nc.variables['DEPTH_TEMP']) + + temp_na = temp_na[nc_temp_depths>5,:] + + nc_temp_depths = nc_temp_depths[nc_temp_depths>5] + + # boolean of sensors at the shallowest depth + shallowest_sensors = nc_temp_depths == np.min(nc_temp_depths) + + # for each temperature profile where there is at least one non NaN value in the shallowest sensors + for i in np.where(~np.all(np.isnan(temp_na[shallowest_sensors]),axis=0))[0]: + + # check there is at least one non NaN value in the deeper sensors + if np.any(~np.isnan(temp_na[~shallowest_sensors,i])): + + # calculates the mean temperature of the available shallowest sensors to use as a reference to calculate MLD + shallow_temp = np.nanmean(temp_na[shallowest_sensors,i]) + + # extract temperature and depth data using a mean for the shallowest depth, and all non NaN data below + profile_temps = np.append(shallow_temp,temp_na[~shallowest_sensors,i][~np.isnan(temp_na[~shallowest_sensors,i])]) + + profile_depths = np.append(nc_temp_depths[0],nc_temp_depths[~shallowest_sensors][~np.isnan(temp_na[~shallowest_sensors,i])]) + + # check if the current profile contains any temperatures outside the specified threshold values + if np.any(temp_na[~shallowest_sensors,i]>=shallow_temp+thresh_in) or np.any(temp_na[~shallowest_sensors,i]<=shallow_temp-thresh_in): + + # generate a linear interpolator for the profile, which returns nan if extrapolation is attempted + profile_interp_func = interpolate.interp1d(profile_temps,profile_depths,bounds_error=False,fill_value=np.nan) + + # finds the shallowest depth at which the linear interpolation of the profile meets a threshold limit + nc_mld[i] = np.nanmin(profile_interp_func([shallow_temp+thresh_in,shallow_temp-thresh_in])) + + # provides an estimate of uncertainty, by giving the distance to the furthest sensor used to interpolate the MLD + nc_mld_uncert[i] = np.max([np.abs(nc_mld[i]-[x for x in profile_depths if x < nc_mld[i]][-1]),np.abs(nc_mld[i]-next(x for x in profile_depths if x > nc_mld[i]))]) + + + # if none of the sensors are outside the threshold + else: + + # set the mld to the depth of the deepest non NaN sensor + nc_mld[i] = np.max(profile_depths) + + # set the uncertainty to the distance between the sensor and the bottom + nc_mld_uncert[i] = 4600 - nc_mld[i] + + # create the two variables + mld_var_out = nc.createVariable('MLDx', "f4", ("TIME",), fill_value=np.nan, zlib=True) + mld_var_out[:] = nc_mld + mld_var_out.units = 'm' + mld_var_out.comment = 'Calculated using the linear interpolation MLD algorithm found at: INSERT GITHUB ADDRESS' + + mld_uncert_var_out = nc.createVariable('MLDx_standard_error', "f4", ("TIME",), fill_value=np.nan, zlib=True) + mld_uncert_var_out[:] = nc_mld_uncert + mld_uncert_var_out.units = 'm' + + nc.close() + + + + + + + + + + \ No newline at end of file