diff --git a/matlab/add_interp_press.m b/matlab/add_interp_press.m new file mode 100644 index 0000000..5ba9817 --- /dev/null +++ b/matlab/add_interp_press.m @@ -0,0 +1,275 @@ +% SOTS Pressure interpolator + +% This code imports pressure data from an aggregated file (constructed by +% P.Jansen), and creates interpolated pressure records for FV00 raw +% instrument files - firstly by interpolating along time series of pressure +% readings in the aggregate file to find pressures at each time in a +% particular FV00 file, and secondly by interpolating down nominal depths at +% each timestamp to find a pressure value for each FV00 timestamp. + +% Ben Weeding - ben.weeding.26@gmail.com + +%% Load the filenames + +fv00_files = dir('*FV00*.nc'); +agg_files = dir('*Aggregate*.nc'); + +%% Load the pressure data + +agg_pres = ncread(agg_files.name,'PRES'); +agg_pres_info = ncinfo(agg_files.name, 'PRES'); +agg_instrument_index = ncread(agg_files.name,'instrument_index'); +agg_nominal_depth = ncread(agg_files.name,'NOMINAL_DEPTH'); +agg_time = ncread(agg_files.name,'TIME'); + +% Here we prevent the use of bad data from Pulse 8 + +if strfind(fv00_files(1).name,'Pulse-8') + + agg_pres(agg_instrument_index==2 & agg_time+datenum(1950,1,1,0,0,0) >= datenum('30-01-2012 05:00','dd-mm-yyyy HH:MM'))=NaN; + + %agg_pres(agg_instrument_index==2)=NaN; + +end + + +%% Interpolate the pressure and write the data into the FV00 file + +% Loop through each of the fv00 files +for i=1:length(fv00_files) + + disp(fv00_files(i).name) + + % Extract the content from the FV00 file + fv00_contents = ncinfo(fv00_files(i).name); + + % Check if the FV00 file contains pressure data, run the interpolation + % code if not + + if (sum(contains({fv00_contents.Variables(:).Name}, 'PRES')) == 0) + + % Load the FV00 data requiring pressure + %'days since 1950-01-01 00:00:00 UTC' for minilog T + fv00_time = ncread(fv00_files(i).name,'TIME'); + fv00_depth = ncread(fv00_files(i).name,'NOMINAL_DEPTH'); + + % Interpolate the agg pressure records at each nominal depth to + % provide pressure values at each timestamp in the current FV00 + % file + + interp_agg_pres = nan(length(agg_nominal_depth)+1,length(fv00_time)); + + % Include a row of zeros to set surface depth as 0 dbar + + interp_agg_pres(1,:) = zeros(size(fv00_time)); + + agg_nominal_depth_with_0 = [0; agg_nominal_depth]; + + % Loop through each nominal depth in the aggregate file, and get pressure for the FV00 file's time + for j = 1:(length(agg_nominal_depth)) + + % Select the relevant time and pressures + + time_selection = agg_time(agg_instrument_index == (j-1)); + pres_selection = agg_pres(agg_instrument_index == (j-1)); + + % Interpolate along each nominal depth + + interp_agg_pres(j+1,:) = interp1(time_selection,pres_selection,fv00_time); + end + + % Sort the nominal depths and pressures + + [agg_nominal_depth_with_0,sort_idx] = sort(agg_nominal_depth_with_0); + + interp_agg_pres = interp_agg_pres(sort_idx,:); + + + % Linearly interpolate at each timestamp to replace NaN values + + interp_agg_pres = fillmissing(interp_agg_pres,'linear','SamplePoints',agg_nominal_depth_with_0); + + + % At each timestamp in the FV00 record, interpolate a pressure + % value based on the FV00 nominal depth, and the interpolated + % pressures in interp_agg_pres. + pres_interp_dummy = nan(size(fv00_time)); + + + for l = 1:length(fv00_time) + + if sum(~isnan(interp_agg_pres(:,l))) > 1 + + pres_interp_dummy(l) = interp1(agg_nominal_depth_with_0,interp_agg_pres(:,l),fv00_depth); + + end + + end + + pres_interp = pres_interp_dummy; + + % Create an FV01 version of the current FV00 file + + % Create the new FV01 file name + + fv01_name = strrep(fv00_files(i).name,'FV00','FV01'); + fv01_name(end-10:end-3)=datestr(now,'yyyymmdd'); + + % Write the FV00 data into the FV01 file + ncwriteschema(fv01_name, fv00_contents); + + % copy variable data to new file + for v = fv00_contents.Variables + ncwrite(fv01_name, v.Name, ncread(fv00_files(i).name, v.Name)); + end + + % Modify the global attributes of the file to record processing, + % and add to the file history + + ncwriteatt(fv01_name,'/','file_version','Level 1 - partially processed'); + hist = ncreadatt(fv00_files(i).name, '/', 'history'); + ncwriteatt(fv01_name,'/','history',[hist newline datestr(now,'yyyy-mm-dd') ' : Added interpolated pressure from ' agg_files.name]); + + % Add and populate a PRES variable to the FV01 file + nccreate(fv01_name, 'PRES', 'Dimensions',{'TIME',size(pres_interp,1)}, 'FillValue',NaN); + ncwrite(fv01_name, 'PRES', pres_interp); + + % Add quality control variables to the FV01 file, assigning 8 to + % interpolated data in line with Argo + for v = fv00_contents.Variables + + if ~isempty(v.Dimensions) + + nccreate(fv01_name, v.Name + "_quality_control",'Dimensions',{v.Dimensions.Name,v.Dimensions.Length},'FillValue',99); + + ncwriteatt(fv01_name,v.Name + "_quality_control",'long_name',"quality_code for"+v.Name); + + ncwriteatt(fv01_name,v.Name,'ancillary_variables',v.Name + "_quality_control"); + + if contains(v.Name,'PRES') + + ncwrite(fv01_name, v.Name + "_quality_control",8*ones(size(fv00_time))); + + end + + end + + end + + + + % copy attributes from agg file to output file + pres_atts = agg_pres_info.Attributes; % get all attribtes from the aggregate file + for k=1:length(pres_atts) + if (strcmp(pres_atts(k).Name, '_FillValue') == 0) + ncwriteatt(fv01_name, 'PRES', pres_atts(k).Name, pres_atts(k).Value); + end + end + + % Add the relevant attributes to the PRES variable, including a + % comment noting that the data has been linearly interpolated + ncwriteatt(fv01_name, 'PRES', 'comment','pressure data has been interpolated from surrounding pressure sensors'); + + else + + % Load the FV00 data containing pressure + %'days since 1950-01-01 00:00:00 UTC' for minilog T + fv00_time = ncread(fv00_files(i).name,'TIME'); + fv00_depth = ncread(fv00_files(i).name,'NOMINAL_DEPTH'); + fv00_pres = ncread(fv00_files(i).name,'PRES'); + + % Remove bad data in pulse 8 + + if strfind(fv00_files(i).name,'Pulse-8-2011-SBE16plusV2-01606330-34m') + + fv00_pres(4442:end) = NaN; + + end + + % Interpolate the agg pressure records at each nominal depth to + % provide pressure values at each timestamp in the current FV00 + % file + + interp_agg_pres = nan(length(agg_nominal_depth)+1,length(fv00_time)); + + % Include a row of zeros to set surface depth as 0 dbar + + interp_agg_pres(1,:) = zeros(size(fv00_time)); + + agg_nominal_depth_with_0 = [0; agg_nominal_depth]; + + % Loop through each nominal depth in the aggregate file, and get pressure for the FV00 file's time + for j = 1:(length(agg_nominal_depth)) + + % Select the relevant time and pressures + + time_selection = agg_time(agg_instrument_index == (j-1)); + pres_selection = agg_pres(agg_instrument_index == (j-1)); + + % Interpolate along each nominal depth + + interp_agg_pres(j+1,:) = interp1(time_selection,pres_selection,fv00_time); + end + + % Sort the nominal depths and pressures + + [agg_nominal_depth_with_0,sort_idx] = sort(agg_nominal_depth_with_0); + + interp_agg_pres = interp_agg_pres(sort_idx,:); + + + % Linearly interpolate at each timestamp to replace NaN values + + interp_agg_pres = fillmissing(interp_agg_pres,'linear','SamplePoints',agg_nominal_depth_with_0); + + for j = 1:length(fv00_pres) + + if isnan(fv00_pres(j)) + + fv00_pres(j) = interp_agg_pres(agg_nominal_depth_with_0==fv00_depth,j); + + end + + end + + % Create an FV01 version of the current FV00 file + + % Create the new FV01 file name + + fv01_name = strrep(fv00_files(i).name,'FV00','FV01'); + fv01_name(end-10:end-3)=datestr(now,'yyyymmdd'); + + % Write the FV00 data into the FV01 file + ncwriteschema(fv01_name, fv00_contents); + + % copy variable data to new file + for v = fv00_contents.Variables + ncwrite(fv01_name, v.Name, ncread(fv00_files(i).name, v.Name)); + end + + % Modify the global attributes of the file to record processing, + % and add to the file history + + ncwriteatt(fv01_name,'/','file_version','Level 1 - partially processed'); + hist = ncreadatt(fv00_files(i).name, '/', 'history'); + ncwriteatt(fv01_name,'/','history',[hist newline datestr(now,'yyyy-mm-dd') ' : Filled missing pressure with interpolated pressure from ' agg_files.name]); + + % Add and populate a PRES variable to the FV01 file + %nccreate(fv01_name, 'PRES', 'Dimensions',{'TIME',size(fv00_pres,1)}, 'FillValue',NaN); + ncwrite(fv01_name, 'PRES', fv00_pres); + + % copy attributes from agg file to output file + pres_atts = agg_pres_info.Attributes; % get all attribtes from the aggregate file + for k=1:length(pres_atts) + if (strcmp(pres_atts(k).Name, '_FillValue') == 0) + ncwriteatt(fv01_name, 'PRES', pres_atts(k).Name, pres_atts(k).Value); + end + end + + % Add the relevant attributes to the PRES variable, including a + % comment noting that the data has been linearly interpolated + ncwriteatt(fv01_name, 'PRES', 'comment','originally missing pressure data has been interpolated from surrounding pressure sensors'); + + + end +end \ No newline at end of file diff --git a/ocean_dp/aggregation/copyDataset.py b/ocean_dp/aggregation/copyDataset.py index 0079d13..54eb960 100644 --- a/ocean_dp/aggregation/copyDataset.py +++ b/ocean_dp/aggregation/copyDataset.py @@ -15,14 +15,13 @@ # similar more general tool project https://ncagg.readthedocs.io/en/latest/ (does not work on python3 2019-10-01) # has configurable way of dealing with attributes - # file sets to test against # http://thredds.aodn.org.au/thredds/catalog/IMOS/ANMN/NRS/NRSKAI/Temperature/catalog.html # http://thredds.aodn.org.au/thredds/catalog/IMOS/ANMN/NRS/NRSKAI/Biogeochem_profiles/catalog.html # http://thredds.aodn.org.au/thredds/catalog/IMOS/ABOS/DA/EAC2000/catalog.html from dateutil.parser import parse - + def aggregate(files, varNames): # split this into createCatalog - copy needed information into structure @@ -39,7 +38,10 @@ def aggregate(files, varNames): # look over all files, create a time array from all files # TODO: maybe delete files here without variables we're not interested in # TODO: Create set of variables in all files - + if not isinstance(varNames, list): + + varNames = [varNames] + filen = 0 for path_file in files: @@ -244,8 +246,8 @@ def aggregate(files, varNames): filen = 0 - # variables we want regardless - varNames += ['LATITUDE', 'LONGITUDE', 'NOMINAL_DEPTH'] + # variables we want regardless + varNames.extend(['LATITUDE', 'LONGITUDE', 'NOMINAL_DEPTH']) # remove any duplicates varNamesOut = set(varNames) @@ -344,7 +346,7 @@ def aggregate(files, varNames): dMin = maVariableAll.max(0) ncOut.setncattr("geospatial_vertical_max", dMax) ncOut.setncattr("geospatial_vertical_min", dMin) - + dsIn.close() # we're done with the varList now ncOut.close() @@ -352,56 +354,56 @@ def aggregate(files, varNames): return outputName -def collect_vars_to_agg(files): +# def collect_vars_to_agg(files): - var_list = [] +# var_list = [] - nc = Dataset(files[0]) - varList = nc.variables +# nc = Dataset(files[0]) +# varList = nc.variables - # default to all variables in first file should no variable be specified - var_list.extend(varList.keys()) - var_list.remove("TIME") +# # default to all variables in first file should no variable be specified +# var_list.extend(varList.keys()) +# var_list.remove("TIME") - nc.close() +# nc.close() - print("collect_vars_to_agg::", var_list) +# print("collect_vars_to_agg::", var_list) - return var_list +# return var_list -if __name__ == "__main__": +# if __name__ == "__main__": - files = [] - varToAgg = None # defaults to all in first file +# files = [] +# varToAgg = None # defaults to all in first file - if len(sys.argv) > 1: - parser = argparse.ArgumentParser() - parser.add_argument('-v', action='append', dest='var', help='variable to include in output file (defaults to all)') - parser.add_argument('-f', dest='filelist', help='read file names from file') - parser.add_argument('file', nargs='*', help='input file name') - args = parser.parse_args() +# if len(sys.argv) > 1: +# parser = argparse.ArgumentParser() +# parser.add_argument('-v', action='append', dest='var', help='variable to include in output file (defaults to all)') +# parser.add_argument('-f', dest='filelist', help='read file names from file') +# parser.add_argument('file', nargs='*', help='input file name') +# args = parser.parse_args() - if not isinstance(args.filelist, type(None)): - with open(args.filelist, "r") as ins: - for line in ins: - print(line) - files.append(line.strip()) +# if not isinstance(args.filelist, type(None)): +# with open(args.filelist, "r") as ins: +# for line in ins: +# print(line) +# files.append(line.strip()) - if len(args.file): - # files = args.file - for fn in args.file: - files.extend(glob.glob(fn)) +# if len(args.file): +# # files = args.file +# for fn in args.file: +# files.extend(glob.glob(fn)) - varToAgg = args.var +# varToAgg = args.var - if isinstance(varToAgg, type(None)): - varToAgg = collect_vars_to_agg(files) +# if isinstance(varToAgg, type(None)): +# varToAgg = collect_vars_to_agg(files) - print("Aggregating variables ", varToAgg) +# print("Aggregating variables ", varToAgg) - outputName = aggregate(files, varToAgg) +# outputName = aggregate(files, varToAgg) - print("Output file : %s" % outputName) +# print("Output file : %s" % outputName) diff --git a/ocean_dp/file_name/find_file_with.py b/ocean_dp/file_name/find_file_with.py new file mode 100755 index 0000000..59225b3 --- /dev/null +++ b/ocean_dp/file_name/find_file_with.py @@ -0,0 +1,90 @@ +#!/usr/bin/python3 + +# raw2netCDF +# Copyright (C) 2019 Peter Jansen +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +import glob + +import sys +import re + +from netCDF4 import Dataset + + +def find_files_pattern(file_pattern): + match_files = [] + files = glob.glob(file_pattern) + + match_files.extend(files) + return match_files + +def find_global(files, attribute, regexp): + + match_files = [] + #print("find", file_pattern, files) + for f in files: + #print("check file", f) + ds = Dataset(f, 'r') + if attribute in ds.ncattrs(): + if re.match(regexp, ds.getncattr(attribute)): + match_files.append(f) + ds.close() + + return match_files + + +def find_variable(files, variable): + + match_files = [] + for f in files: + #print("check file", f) + ds = Dataset(f, 'r') + if variable in ds.variables: + match_files.append(f) + ds.close() + + return match_files + + +def find_variable_attribute(files, attribute, value): + + match_files = [] + for f in files: + #print("check file", f) + ds = Dataset(f, 'r') + nv = {attribute : value} + find = ds.get_variables_by_attributes(**nv) + if len(find) > 0: + match_files.append(f) + ds.close() + + return match_files + + +if __name__ == "__main__": + fns = [] + if sys.argv[1] == '-v': + files = find_files_pattern(sys.argv[3]) + fns = find_variable(files, variable=sys.argv[2]) + elif sys.argv[1] == '-a': + files = find_files_pattern(sys.argv[4]) + fns = find_variable_attribute(files, attribute=sys.argv[2], value=sys.argv[3]) + else: + files = find_files_pattern(sys.argv[4]) + fns = find_global(files, attribute=sys.argv[1], regexp=sys.argv[2]) + + for f in fns: + print(f) \ No newline at end of file diff --git a/ocean_dp/plotting/batch-qc.pdf b/ocean_dp/plotting/batch-qc.pdf new file mode 100644 index 0000000..5e7a87b Binary files /dev/null and b/ocean_dp/plotting/batch-qc.pdf differ diff --git a/ocean_dp/plotting/density_plot.py b/ocean_dp/plotting/density_plot.py new file mode 100755 index 0000000..a798b02 --- /dev/null +++ b/ocean_dp/plotting/density_plot.py @@ -0,0 +1,25 @@ +# Copyright (C) 2020 Ben Weeding +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +import sys +import os + +sys.path.append('/Users/tru050/Documents/GitHub/imos-tools/ocean_dp/file_name') + +import find_file_with + +path = "/Users/Tru050/Desktop/cloudstor/Shared/SOTS-Temp-Raw-Data" + +sots_files = find_file_with.find_files_pattern(os.path.join(path, "IMOS*FV00*.nc")) \ No newline at end of file diff --git a/ocean_dp/plotting/panda_maker.py b/ocean_dp/plotting/panda_maker.py new file mode 100755 index 0000000..2fc9bfb --- /dev/null +++ b/ocean_dp/plotting/panda_maker.py @@ -0,0 +1,128 @@ +# Copyright (C) 2020 Ben Weeding +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +import numpy.ma as ma +import sys +from netCDF4 import Dataset, num2date +from dateutil import parser +import numpy as np +import argparse +import glob +import pytz +import os +import matplotlib.pyplot as plt +from matplotlib import colors +from matplotlib.ticker import PercentFormatter +from sigfig import round +import pandas as pd + +import warnings +import scipy.stats as st +import statsmodels as sm +import matplotlib + +# this function creates a pandas Datatable object, searching through all the +# netcdf files in the directory given, containing all the variables specified + +# "/Users/tru050/Desktop/cloudstor/Shared/SOTS-Temp-Raw-Data" + +# ["dTemp/dtime","dTemp/dSample","QC","Nominal depth","Deployment"] + +# qc selection!!! + +def panda_maker(dir_spec,var_list,qc_lim=2): + + # creates an empty array to store the names of the SOTS deployments + deployments = [] + + checked_files = [] + + processed_files = [] + + # loops through all the folders and files contained in the folder + for x in os.listdir(dir_spec): + + # if the folder/file name contains 'Pulse' or 'SOFS' and doesn't contain '.', append it to deployments + if (('Pulse' in x) or ('SOFS' in x)) and ('.p' not in x): + + deployments.append(x) + + + + # create a dataframe to store extract information + total_df = pd.DataFrame(columns = var_list) + + # add deployment code to the dataframe + total_df.insert(len(var_list),'Deployment code',[]) + + # loops through all files in the directory + for root, dirs, files in os.walk("/Users/tru050/Desktop/cloudstor/Shared/SOTS-Temp-Raw-Data"): + + for fname in files: + + # append the filename to the list of checked files + checked_files.append(fname) + + # for each netcdf file labelled as FV01 and containing a deployment in its name + if fname.endswith('.nc') and 'FV01' in fname and any(ele in fname for ele in deployments): + + # print the filename + print(fname) + + # open the file + nc = Dataset(os.path.join(root,fname), mode = 'r') + + # check file contains all the specified variables and the time format is correct + if (all(ele in list(nc.variables) for ele in var_list)) & (nc.variables['TIME'].getncattr('units') =='days since 1950-01-01 00:00:00 UTC'): + + # create a current dataframe for the netcdf file, to be appended to the overall dataframe + cur_df = pd.DataFrame(columns=var_list) + + # create a qc vector for the netcdf file + cur_qc = np.zeros(nc.variables["TIME"].shape) + + for cur_var in var_list: + + if np.array(nc.variables[cur_var]).size == 1: + + filling = np.ones(nc.variables["TIME"].shape) * np.array(nc.variables[cur_var]) + + else: + + filling = np.array(nc.variables[cur_var][:]) + + if cur_var + '_quality_control' in list(nc.variables): + + cur_qc = np.maximum(cur_qc,np.array(nc.variables[cur_var + '_quality_control'])) + + + cur_df[cur_var] = filling + + cur_df['Deployment code'] = [nc.deployment_code] * len(np.array(nc.variables['TIME'])) + + # append the current netcdf's dataframe to the sots_temp_ensemble + total_df = total_df.append(cur_df.iloc[np.where(cur_qc<=qc_lim)]) + + # append the filename to the list of processed files + processed_files.append(fname) + + + nc.close() + + total_df = total_df.reset_index(drop=True) + + + return total_df + \ No newline at end of file diff --git a/ocean_dp/plotting/plotNetCDFmultiqc.py b/ocean_dp/plotting/plotNetCDFmultiqc.py new file mode 100755 index 0000000..9442a56 --- /dev/null +++ b/ocean_dp/plotting/plotNetCDFmultiqc.py @@ -0,0 +1,340 @@ +#!/usr/bin/python3 + +# raw2netCDF +# Copyright (C) 2019 Peter Jansen +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +from netCDF4 import Dataset +from netCDF4 import num2date +import datetime as dt +import numpy as np +import matplotlib + +matplotlib.use('Agg') + +import matplotlib.pyplot as plt +from matplotlib.backends.backend_pdf import PdfPages +import sys +import os +from matplotlib import rc + +# rc('text', usetex=True) + +for path_file in sys.argv[1:len(sys.argv)]: + + nc = Dataset(path_file) + + # get time variable + nctime = nc.variables['TIME'][:] + t_unit = nc.variables['TIME'].units # get unit "days since 1950-01-01T00:00:00Z" + + try: + t_cal = nc.variables['TIME'].calendar + except AttributeError: # Attribute doesn't exist + t_cal = u"gregorian" # or standard + + dt_time = [num2date(t, units=t_unit, calendar=t_cal) for t in nctime] + + # work out variables to plot + nc_vars_to_plot = [var for var in nc.variables] + + # remove any dimensions from the list to plot + nc_dims = [dim for dim in nc.dimensions] # list of nc dimensions + + for i in nc_dims: + try: + nc_vars_to_plot.remove(i) + except ValueError: + print('did not remove ', i) + + # remove an auxiliary variables from the list to plot + aux_vars = list() + for var in nc.variables: + try: + aux_vars.append(nc.variables[var].getncattr('ancillary_variables')) + except AttributeError: + pass + + # remove any variables without a TIME dimension from the list to plot + to_plot = list() + + for var in nc.variables: + # print var + if var in nc_dims: + continue + if var in aux_vars: + continue + if 'TIME' in nc.variables[var].dimensions: + print('to plot ', var) + to_plot.append(var) + + # pdffile = path_file[path_file.rfind('/')+1:len(path_file)] + '-' + nc.getncattr('deployment_code') + '-plot.pdf' + + pdffile = path_file + '.pdf' + + pp = PdfPages(pdffile) + + txt = "" + lines = 0 + plt.figure(figsize=(11.69, 8.27)) + + txt = 'file name : ' + os.path.basename(path_file) + '\n\n' + + txt += 'Dimensions:\n' + for x in nc.dimensions: + txt += ' ' + x + ' (' + str(nc.dimensions[x].size) + ')\n' + + txt += '\nVariables:\n' + for x in nc.variables: + v_atts = nc.variables[x] + var_line = ' ' + x + ' ' + str(v_atts.dimensions) + + try: + var_line += ' : long_name = ' + v_atts.long_name + except AttributeError: + pass + try: + var_line += ' (' + v_atts.units + ')' + except AttributeError: + pass + var_line += ' : type ' + str(v_atts.datatype) + + print(var_line) + + lines = txt.count('\n') + var_line.count('\n') + # print("lines ", lines) + if lines > 57: + #print(txt) + print('new page') + plt.text(-0.1, -0.1, txt, fontsize=8, family='monospace') + plt.axis('off') + pp.savefig() + plt.close() + plt.figure(figsize=(11.69, 8.27)) + + txt = "" + + lines = 0 + + txt += var_line + '\n' + + plt.figure(figsize=(11.69, 8.27)) + + plt.text(-0.1, -0.1, txt, fontsize=8, family='monospace') + plt.axis('off') + pp.savefig() + plt.close() + + txt = "" + plt.figure(figsize=(11.69, 8.27)) + + lines = 0 + # print "NetCDF Global Attributes:" + for nc_attr in sorted(nc.ncattrs(), key=lambda s: s.lower()): + #print('\t%s:' % nc_attr, repr(nc.getncattr(nc_attr))) + attrib_txt = nc_attr + ' : ' + str(nc.getncattr(nc_attr)).replace('\n', '\n ') + '\n' + lines = txt.count('\n') + attrib_txt.count('\n') + # print("lines ", lines) + if lines > 57: + #print(txt) + print('new page') + plt.text(-0.1, -0.1, txt, fontsize=8, family='monospace') + plt.axis('off') + pp.savefig() + plt.close() + plt.figure(figsize=(11.69, 8.27)) + + txt = "" + + lines = 0 + + txt += attrib_txt + + lines += 1 + + #print(txt) + plt.text(-0.1, -0.1, txt, fontsize=8, family='monospace') + plt.axis('off') + pp.savefig() + plt.close() + + # plot each variable in the to_plot list + for plot in to_plot: + + plot_var = nc.variables[plot] + + var = plot_var[:] + shape_len = len(var.shape) + + # create a page with information about the variable, and any aux variables + fig = plt.figure(figsize=(11.69, 8.27)) + + text = "Variable : " + plot_var.name + str(plot_var.dimensions) + "\n" + nc_attrs = plot_var.ncattrs() + # print "NetCDF Variable Attributes:" + for nc_attr in nc_attrs: + attrVal = plot_var.getncattr(nc_attr) + #print('\t%s:' % nc_attr, repr(plot_var.getncattr(nc_attr)), type(attrVal)) + text += nc_attr + ' : ' + str(attrVal) + '\n' + + if hasattr(plot_var, 'ancillary_variables'): + qc_var_name = plot_var.getncattr('ancillary_variables') + qc_var = nc.variables[qc_var_name] + + text += "\nAUX : " + qc_var.name + str(qc_var.dimensions) + "\n" + + nc_attrs = qc_var.ncattrs() + # print "NetCDF AUX Variable Attributes:" + for nc_attr in nc_attrs: + # print '\t%s:' % nc_attr, repr(nc.getncattr(nc_attr)) + text += nc_attr + ' : ' + str(qc_var.getncattr(nc_attr)) + '\n' + + qc = nc.variables[qc_var_name][:] + + if plot_var.dimensions[0] != 'TIME': + qc = np.transpose(qc) + + qc = np.squeeze(qc) + else: + qc = 0 + + plt.text(-0.1, 0.0, text, fontsize=8, family='monospace') + plt.axis('off') + pp.savefig(fig) + plt.close(fig) + + print(plot_var.name, " shape ", var.shape, " len ", shape_len) + + # now create a page with the plot + + fig = plt.figure(figsize=(11.69, 8.27)) + ax = plt.subplot(111) + + if plot_var.dimensions[0] != 'TIME': + var = np.transpose(var) + var = np.squeeze(var) + + # create range from only good data + qc_m = np.ma.masked_where((qc == 9) | (qc == 4) | (qc == 6), var) + mx = qc_m.max() + mi = qc_m.min() + + marg = (mx - mi) * 0.1 + print("max ", mx, " min ", mi) + + plt.ylim([mi - marg, mx + marg]) + + # create a legend entry made from serial_number and depth + if hasattr(plot_var, 'sensor_serial_number'): + sn = plot_var.getncattr('sensor_serial_number').split('; ') + elif hasattr(plot_var, 'sensor_serial_number'): + sn = nc.getncattr('instrument_serial_number').split('; ') + else: + sn = 'not found' + + if hasattr(plot_var, 'sensor_depth'): + dpth = plot_var.getncattr('sensor_depth').split('; ') + elif hasattr(plot_var, 'sensor_height'): + dpth = plot_var.getncattr('sensor_height').split('; ') + elif hasattr(nc, 'instrument_nominal_depth'): + dpth = str(nc.getncattr('instrument_nominal_depth')).split('; ') + else: + dpth = 'unknown' + + print("depth ", dpth) + + leg = [x + ' (' + y + ' m)' for x, y in zip(sn, dpth)] + + # if less than 200 points plot with a dot and line + plot_marks = '-' + if len(dt_time) < 200: + plot_marks = '.-' + + pl = ax.plot(dt_time, qc_m, plot_marks) + + # mark qc>2 with yellow dot, qc>3 with red dot + qc_m = np.ma.masked_where((qc <= 2) | (qc == 8), var) + ax.plot(dt_time, qc_m, 'yo') + qc_m = np.ma.masked_where((qc <= 3) | (qc == 8), var) + ax.plot(dt_time, qc_m, 'ro') + + # shrink the plot some + box = ax.get_position() + ax.set_position([box.x0, box.y0 + box.height * 0.1, box.width, box.height * 0.9]) + + # add legend below plot + #plt.legend(iter(pl), leg, loc='lower center', bbox_to_anchor=(0.5, -0.05), ncol=5) + + plt.legend(iter(pl), leg, bbox_to_anchor=(0.0, -0.2, 1.0, -0.15), loc=3, ncol=6, mode="expand", borderaxespad=0.0, fontsize='x-small') + + # invert the yaxis if the units are dbar + try: + if plot_var.units == 'dbar': + plt.gca().invert_yaxis() + except AttributeError: + pass + try: + if plot_var.positive == 'down': + plt.gca().invert_yaxis() + except AttributeError: + pass + + #fig.autofmt_xdate() + plt.grid() + + # add deployment/instrument/standard name as title + + # plt.title(nc.getncattr('deployment_code') + ' : ' + plot_var.sensor_name + ' ' + \ + # plot_var.sensor_serial_number + ' : ' + plot_var.name, fontsize=10) + + # plt.title(nc.getncattr('deployment_code') + ' : ' + plot_var.getncattr('name'), fontsize=10) + try: + plt.title(nc.getncattr('deployment_code'), fontsize=10) + except AttributeError: + pass + + # add units to Y axis + try: + plt.ylabel(plot + ' (' + plot_var.units + ')') + except AttributeError: + pass + + date_time_start = None + date_time_end = None + + # plot only the time of deployment + try: + date_time_start = dt.datetime.strptime(nc.getncattr('time_coverage_start'), '%Y-%m-%dT%H:%M:%SZ') + date_time_end = dt.datetime.strptime(nc.getncattr('time_coverage_end'), '%Y-%m-%dT%H:%M:%SZ') + except AttributeError: + pass + try: + date_time_start = dt.datetime.strptime(nc.getncattr('time_deployment_start'), '%Y-%m-%dT%H:%M:%SZ') + date_time_end = dt.datetime.strptime(nc.getncattr('time_deployment_end'), '%Y-%m-%dT%H:%M:%SZ') + except AttributeError: + pass + + if date_time_start: + plt.xlim(date_time_start, date_time_end) + + # plt.savefig(plot + '.pdf') + pp.savefig(fig, papertype='a4') + plt.close(fig) + + # plt.show() + + pp.close() + + nc.close() diff --git a/ocean_dp/plotting/plotQC.py b/ocean_dp/plotting/plotQC.py new file mode 100755 index 0000000..979976a --- /dev/null +++ b/ocean_dp/plotting/plotQC.py @@ -0,0 +1,88 @@ +#!/usr/bin/python3 + +# raw2netCDF +# Copyright (C) 2019 Peter Jansen +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +import xarray as xr + +import matplotlib.pyplot as plt +from matplotlib.backends.backend_pdf import PdfPages +import seaborn as sns + +from pandas.plotting import register_matplotlib_converters + +import sys +import os + + +def plot_all(files): + register_matplotlib_converters() + plt.style.use('seaborn-darkgrid') + sns.set_context("paper") + + pp = PdfPages(os.path.join(os.path.dirname(files[0]), "batch-qc.pdf")) + + for f in files: + print("file ", f) + do_plot(f) + + pp.savefig() + plt.close() + + pp.close() + + +def plot(fn): + register_matplotlib_converters() + plt.style.use('seaborn-darkgrid') + sns.set_context("paper") + + pp = PdfPages(fn + "-qc.pdf") + + do_plot(fn) + + pp.savefig() + pp.close() + plt.close() + + +def do_plot(fn): + + #fn = sys.argv[1] + #fn = '/Users/pete/cloudstor/SOTS-Temp-Raw-Data/SOFS-7.5-2018/netCDF/IMOS_ABOS-SOTS_TIP_20180801_SOFS_FV01_SOFS-7.5-2018-Starmon-mini-4047-40m_END-20190331_C-20200429.nc' + + DS = xr.open_dataset(fn) + + ax1 = plt.subplot(2, 1, 1) + plt.plot(DS.TIME, DS.TEMP) + plt.title(DS.deployment_code + " - " + DS.instrument_model + ":" + DS.instrument_serial_number + " @ " + str(DS.instrument_nominal_depth), {'fontsize': 8}) + + ax2 = plt.subplot(2, 1, 2, sharex=ax1) + aux = DS.TEMP.ancillary_variables + a_vars = aux.split(" ") + for f in sorted(set(a_vars)): + print('aux var', f) + varn = f.split("_") + plt.plot(DS.TIME, DS.variables[f], label=varn[-1]) + plt.ylim(0, 9) + + plt.legend(prop={'size': 6}) + + DS.close() + + +if __name__ == "__main__": + plot_all(sys.argv[1:]) \ No newline at end of file diff --git a/ocean_dp/processing/addCO2.py b/ocean_dp/processing/addCO2.py new file mode 100755 index 0000000..8a14af8 --- /dev/null +++ b/ocean_dp/processing/addCO2.py @@ -0,0 +1,94 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Tue Jan 21 14:58:20 2020 + +@author: tru050 +""" + + + + +from netCDF4 import Dataset +import sys +import gsw +import numpy as np +from datetime import datetime +import pandas + +# addCO2 takes a SOTS FV02 gridded product netCDFfile as an input, and adds +# CO2 data (delivered from NOAA in a csv file) to the netCDFfile +def addCO2(netCDFfile): + + # Import the SOTS netcdf + ds = Dataset(netCDFfile, 'a') + + # Extract the time variable, in serial date numbers (days since 01/01/1950) + var_time = ds.variables["TIME"] + + # Convert the variable object to an array + netcdf_serials = np.array(var_time[:]) + + # Read in the CO2 csv file, ignoring the first five rows + dcsv = pandas.read_csv('SOFS_prelimdata_Nov2019test.csv',header=5) + + # Convert the dataframe to an array + dc = dcsv.to_numpy() + + csv_dates = [] + + # Create a list of datetimes from the csv + for i in range(len(dc)): + + csv_dates.append(datetime.strptime(dc[i,0],'%m/%d/%Y %H:%M')) + + # Calculate the difference between the csv dates and 01/01/1950 in order + # to convert them to the serial date format of the netcdf + time_offset_1950 = datetime(1950,1,1,0,0,0) + + csv_delta= [] + + for i in range(len(dc)): + + csv_delta.append(csv_dates[i] - time_offset_1950) + + + # Convert the datetimes from the csv into an array of serial date numbers + csv_serials = [] + + for i in range(len(dc)): + + csv_serials.append(csv_delta[i].days + csv_delta[i].seconds/86400) + + csv_serials = np.array(csv_serials) + + # Find the indices of timestamps of the csv file that are in the deployment + # period of the netcdf file + matching_index = (netcdf_serials[0] <= csv_serials) & (csv_serials <= netcdf_serials[-1]) + + new_vars = ['XCO2_PRES','XCO2_OCEAN','XCO2_AIR','XCO2_PSAL','XCO2_SSTEMP'] + + new_units = ['kPa','umol/mol','umol/mol','Presumed PSU - not specified','deg C'] + + # For each of the variables in the csv file (except time), linearly + # interpolate to the timestamps of the netcdf file + for i in range(0,len(new_vars)): + + ncVarOut = ds.createVariable(new_vars[i], "f4", ("TIME",), fill_value=np.nan, zlib=True) + + ncVarOut[:] = np.interp(netcdf_serials,csv_serials[matching_index],np.array(dcsv[dcsv.columns[i+1]])[matching_index].astype('float64')) + + ncVarOut.units = new_units[i] + + ncVarOut.comment = "imported from 'SOFS_prelimdata_Nov2019.csv'" + + # update the history attribute + try: + hist = ds.history + "\n" + except AttributeError: + hist = "" + + ds.setncattr('history', hist + datetime.utcnow().strftime("%Y-%m-%d") + " : added 'XCO2_PRES','XCO2_OCEAN','XCO2_AIR','XCO2_PSAL','XCO2_SSTEMP' from 'SOFS_prelimdata_Nov2019.csv'") + + ds.close() + \ No newline at end of file diff --git a/ocean_dp/processing/add_density.py b/ocean_dp/processing/add_density.py new file mode 100755 index 0000000..9320ddf --- /dev/null +++ b/ocean_dp/processing/add_density.py @@ -0,0 +1,80 @@ +# Copyright (C) 2020 Ben Weeding and Peter Jansen +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + + +from netCDF4 import Dataset +import sys +import gsw +import numpy as np +from datetime import datetime + +# add density to a data file with TEMP, PSAL, PRES variables, many assumptions are made about the input file +# based on Peter Jansen's addPSAL.py, using TEOS-10 + +def add_density(netCDFfile): + + # loads the netcdf file + ds = Dataset(netCDFfile, 'a') + + if 'DENSITY' in list(ds.variables): + + ds.close() + + return "file already contains density" + + # extracts the variables from the netcdf + var_temp = ds.variables["TEMP"] + var_psal = ds.variables["PSAL"] + var_pres = ds.variables["PRES"] + var_lon = ds.variables["LONGITUDE"] + var_lat = ds.variables["LATITUDE"] + + # extracts the data from the variables + t = var_temp[:] + psal = var_psal[:] + p = var_pres[:] + lon = var_lon[:] + lat = var_lat[:] + + # calculates absolute salinity + SA = gsw.SA_from_SP(psal, p, lon, lat) + + # calculates conservative temperature + CT = gsw.CT_from_t(SA, t, p) + + # calculates density + density = gsw.rho(SA, CT, p) + + # generates a new variable 'DENSITY' in the netcdf + ncVarOut = ds.createVariable("DENSITY", "f4", ("TIME",), fill_value=np.nan, zlib=True) # fill_value=nan otherwise defaults to max + + # assigns the calculated densities to the DENSITY variable, sets the units as kg/m^3, and comments on the variable's origin + ncVarOut[:] = density + ncVarOut.units = "kg/m^3" + ncVarOut.comment = "calculated using gsw-python https://teos-10.github.io/GSW-Python/index.html" + + # update the history attribute + try: + hist = ds.history + "\n" + except AttributeError: + hist = "" + + ds.setncattr('history', hist + datetime.utcnow().strftime("%Y-%m-%d") + " : added DENSITY from TEMP, PSAL, PRES, LAT, LON") + + ds.close() + + +if __name__ == "__main__": + add_density(sys.argv[1]) diff --git a/ocean_dp/processing/add_mld.py b/ocean_dp/processing/add_mld.py new file mode 100755 index 0000000..112ce35 --- /dev/null +++ b/ocean_dp/processing/add_mld.py @@ -0,0 +1,101 @@ +# Copyright (C) 2020 Ben Weeding +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +from netCDF4 import Dataset, num2date +import sys +from datetime import datetime as dt +import numpy as np +import pandas as pd +from scipy import interpolate + +def add_mld(nc_in,thresh_in=0.2): + + # opens the supplied IMOS netcdf + nc = Dataset(nc_in,'a') + + temp_na = np.array(nc.variables['TEMP']) + + # create two nan filled arrays the length of the FV02 file, one for the mld and one for its uncertainty + nc_mld = np.full([1,temp_na.shape[1]], np.nan)[0] + + nc_mld_uncert = np.full([1,temp_na.shape[1]], np.nan)[0] + + # temp sensor depths + nc_temp_depths = np.array(nc.variables['DEPTH_TEMP']) + + temp_na = temp_na[nc_temp_depths>5,:] + + nc_temp_depths = nc_temp_depths[nc_temp_depths>5] + + # boolean of sensors at the shallowest depth + shallowest_sensors = nc_temp_depths == np.min(nc_temp_depths) + + # for each temperature profile where there is at least one non NaN value in the shallowest sensors + for i in np.where(~np.all(np.isnan(temp_na[shallowest_sensors]),axis=0))[0]: + + # check there is at least one non NaN value in the deeper sensors + if np.any(~np.isnan(temp_na[~shallowest_sensors,i])): + + # calculates the mean temperature of the available shallowest sensors to use as a reference to calculate MLD + shallow_temp = np.nanmean(temp_na[shallowest_sensors,i]) + + # extract temperature and depth data using a mean for the shallowest depth, and all non NaN data below + profile_temps = np.append(shallow_temp,temp_na[~shallowest_sensors,i][~np.isnan(temp_na[~shallowest_sensors,i])]) + + profile_depths = np.append(nc_temp_depths[0],nc_temp_depths[~shallowest_sensors][~np.isnan(temp_na[~shallowest_sensors,i])]) + + # check if the current profile contains any temperatures outside the specified threshold values + if np.any(temp_na[~shallowest_sensors,i]>=shallow_temp+thresh_in) or np.any(temp_na[~shallowest_sensors,i]<=shallow_temp-thresh_in): + + # generate a linear interpolator for the profile, which returns nan if extrapolation is attempted + profile_interp_func = interpolate.interp1d(profile_temps,profile_depths,bounds_error=False,fill_value=np.nan) + + # finds the shallowest depth at which the linear interpolation of the profile meets a threshold limit + nc_mld[i] = np.nanmin(profile_interp_func([shallow_temp+thresh_in,shallow_temp-thresh_in])) + + # provides an estimate of uncertainty, by giving the distance to the furthest sensor used to interpolate the MLD + nc_mld_uncert[i] = np.max([np.abs(nc_mld[i]-[x for x in profile_depths if x < nc_mld[i]][-1]),np.abs(nc_mld[i]-next(x for x in profile_depths if x > nc_mld[i]))]) + + + # if none of the sensors are outside the threshold + else: + + # set the mld to the depth of the deepest non NaN sensor + nc_mld[i] = np.max(profile_depths) + + # set the uncertainty to the distance between the sensor and the bottom + nc_mld_uncert[i] = 4600 - nc_mld[i] + + # create the two variables + mld_var_out = nc.createVariable('MLDx', "f4", ("TIME",), fill_value=np.nan, zlib=True) + mld_var_out[:] = nc_mld + mld_var_out.units = 'm' + mld_var_out.comment = 'Calculated using the linear interpolation MLD algorithm found at: INSERT GITHUB ADDRESS' + + mld_uncert_var_out = nc.createVariable('MLDx_standard_error', "f4", ("TIME",), fill_value=np.nan, zlib=True) + mld_uncert_var_out[:] = nc_mld_uncert + mld_uncert_var_out.units = 'm' + + nc.close() + + + + + + + + + + \ No newline at end of file diff --git a/ocean_dp/processing/netcdf_to_df.py b/ocean_dp/processing/netcdf_to_df.py new file mode 100755 index 0000000..4a409a4 --- /dev/null +++ b/ocean_dp/processing/netcdf_to_df.py @@ -0,0 +1,211 @@ +# Copyright (C) 2020 Ben Weeding +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +from netCDF4 import Dataset, num2date +import pandas as pd +import numpy as np +from datetime import datetime as dt +import glob +import re + +# ============================================================================= +# Returns a list of the time series variables in a IMOS format +# netcdf file. Takes an open netcdf as its argument. +# ============================================================================= +def var_selector_inc_time(nc,qc=False,): + + x = [x for x in list(nc.variables) if ('_quality_control_' not in x) & (nc.variables[x].shape!=())] + + return x + +# ============================================================================= +# +# ============================================================================= +def netcdf_to_df(target_file): + + # open the inputted netcdf + nc = Dataset(target_file,mode='r') + + # creates a the list of variables to transfer to the dataframe + vars_to_transfer = var_selector_inc_time(nc) + + # creates the dataframe with column labels + df = pd.DataFrame(columns = vars_to_transfer) + + # sorts the columns alphabetically, with the relevant qc variable following each timeseries variable + df.sort_index(axis=1, inplace=True) + + # fill the dataframe from the netcdf, variable by variable + for cur_var in vars_to_transfer: + + df[cur_var] = np.array(nc.variables[cur_var]) + + # store deployment times in attributes + df.attrs['time_deployment_start'] = nc.time_deployment_start + df.attrs['time_deployment_end'] = nc.time_deployment_end + df.attrs['nominal_depth'] = nc.variables['NOMINAL_DEPTH'][0] + + # extract the column names + col_names = list(df.columns) + + # append the nominal depth to all column names + df.columns = [x.replace('quality_control',str(nc.variables['NOMINAL_DEPTH'][0])+'_quality_control') if 'quality_control' in x else x if 'TIME' in x else x + '_' + str(nc.variables['NOMINAL_DEPTH'][0]) for x in col_names] + + nc.close() + + return df + + +# ============================================================================= +# Takes +# ============================================================================= + +def combine_df(target_dfs): + + total_df = pd.DataFrame({'dummy' : []}) + + # for each of the dataframes in the list provided + for cur_df in target_dfs: + + # make a copy of the current dataframe to modify and combine + df = cur_df.copy() + + # convert the IMOS format times to datetime + df['TIME']=pd.to_timedelta(df['TIME'],unit='D')+dt(1950,1,1) + + # index the dataframe by time - for some reason this makes the df very slow to visually open and navigate!? + df = df.set_index('TIME') + + # extract and convert deployment times to datetime + start_time = dt.strptime(df.attrs['time_deployment_start'],'%Y-%m-%dT%H:%M:%SZ') + end_time = dt.strptime(df.attrs['time_deployment_end'],'%Y-%m-%dT%H:%M:%SZ') + + # trim the df to only include in water data + df = df.drop(df[(df.index < start_time) | (df.index > end_time)].index) + + # remove data with bad qc instead of setting to nan later in process, let resample do the work? + # but what if psal is bad but temp is good?? Need to think on this. + + # resamples using the max method, to create a df of the correct dimensions to fill + df_to_fill = df.resample('H',base=0.5).min() + + + # gets list of column names + col_names = list(df.columns) + + # makes a list of non qc column names + col_names_no_qc = [x for x in col_names if 'quality_control' not in x] + + # for each of the time series data columns + for cur_col in col_names_no_qc: + + # sets the value of non qc data to nan if the corresponding qc value is not satisfactory (0,1,2,7 at the moment) + #df.loc[(df[cur_col+'_quality_control'] > 2) & (df[cur_col+'_quality_control'] != 7), cur_col] = np.nan + # CAUSING UNEXPECTED NANS - FIX + df.loc[df[cur_col+'_quality_control'].isin([3,4,6,9]) , cur_col] = np.nan + + # extracts the time series data + dS = pd.Series(df[cur_col]) + + # makes a copy for bin counting + dS_1s = dS.copy() + + dS_1s[:] = 1 + + # resamples the series, interpoling linearly + dS_resampled = dS.resample('H',base=0.5).interpolate(method='index',axis=0,limit=1000000) + + + # count how many data points are in each shoulder bin + dS_bin_counts = dS_1s.resample('H',base=0.5).sum() + + # fill the interpolated data back into the dataframe + df_to_fill[cur_col] = dS_resampled + + # give any interpolated point without any data within its hour window a qc code of 7 + df_to_fill.loc[dS_bin_counts==0,[cur_col+'_quality_control']] = 7 + + # shift the timestamps to the middle of the hour sampling period + df_to_fill.index = df_to_fill.index + pd.Timedelta('30 min') + + total_df = pd.concat([total_df,df_to_fill], join='outer', axis=1) + + print(cur_df) + + print(len(total_df)) + + total_df.drop(['dummy'],axis=1,inplace=True) + + return total_df + + +# ============================================================================= +# +# ============================================================================= + +def depth_from_file(file_in): + + result = int(re.findall(r'(?<=-)\w+(?=m_END)', file_in)[0]) + + return result + +# ============================================================================= +# +# ============================================================================= + +netcdfs = glob.glob('*FV01*.nc') + +netcdfs = sorted(netcdfs,key=depth_from_file) + +df_list = list() + +for cur_netcdf in netcdfs: + + df = netcdf_to_df(cur_netcdf) + + df_list.append(df) + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/ocean_dp/processing/panda_merger.py b/ocean_dp/processing/panda_merger.py new file mode 100755 index 0000000..13a789c --- /dev/null +++ b/ocean_dp/processing/panda_merger.py @@ -0,0 +1,132 @@ +# Copyright (C) 2020 Ben Weeding +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +import numpy.ma as ma +import sys +from netCDF4 import Dataset, num2date +from dateutil import parser +from datetime import datetime as dt +from datetime import timedelta +import numpy as np +import glob +import pandas as pd +import re + + + + +files = glob.glob('*FV00*.nc') + +var_name = 'TEMP' + +def depth_from_file(file_in): + + result = int(re.findall(r'(?<=-)\w+(?=m_END)', file_in)[0]) + + return result + +def var_selector(file): + + [x for x in var_list if (x!='TIME') & ('_quality_control' not in x) & (nc.variables[x].shape!=())] + + return x + + + +def panda_combine(files,var_name): + + files.sort(key=depth_from_file) + + total_df = pd.DataFrame({'dummy' : []}) + + # make a sorting index for columns from nominal depths + + for cur_file in files: + + cur_nc = Dataset(cur_file,mode='r') + + cur_df = pd.DataFrame({'TIME':np.array(cur_nc.variables['TIME'][:]),var_name+'_'+str(cur_nc.variables['NOMINAL_DEPTH'][0]):np.array(cur_nc.variables[var_name][:])}) + + cur_df['TIME']=pd.to_timedelta(cur_df['TIME'],unit='D')+dt(1950,1,1) + + cur_df = cur_df.set_index('TIME') + + cur_df = cur_df.resample('H',base=0.5).mean() + + cur_df.index = cur_df.index + pd.Timedelta('30 min') + + total_df = pd.concat([total_df,cur_df], join='outer', axis=1) + + print(cur_file) + + print(len(total_df)) + + total_df.drop(['dummy'],axis=1,inplace=True) + + return total_df + + +result = re.findall(r'(?<=-)\w+(?=m_END)', cur_file) + + + + + + +# ============================================================================= +# Old proof of concept code +# ============================================================================= + +# # import two netcdf +# nc1 = Dataset('IMOS_ABOS-SOTS_COPST_20180801_SOFS_FV00_SOFS-7.5-2018-SBE37SMP-ODO-RS232-03715971-200m_END-20190324_C-20200401.nc',mode='r') + +# nc2 = Dataset('IMOS_ABOS-SOTS_T_20180801_SOFS_FV00_SOFS-7.5-2018-Starmon-mini-4048-45m_END-20190331_C-20200401.nc',mode='r') + +# # convert their time and temp data into dataframes +# df1 = pd.DataFrame({'TIME':np.array(nc1.variables['TIME'][:]),'TEMP_'+str(nc1.variables['NOMINAL_DEPTH'][0]):np.array(nc1.variables['TEMP'][:])}) + +# df2 = pd.DataFrame({'TIME':np.array(nc2.variables['TIME'][:]),'TEMP_'+str(nc2.variables['NOMINAL_DEPTH'][0]):np.array(nc2.variables['TEMP'][:])}) + +# # convert the times from days since 01-01-1950 to a datetime object +# df1['TIME']=pd.to_timedelta(df1['TIME'],unit='D')+dt(1950,1,1) + +# df2['TIME']=pd.to_timedelta(df2['TIME'],unit='D')+dt(1950,1,1) + +# # index the dataframes by time +# df1=df1.set_index('TIME') + +# df2=df2.set_index('TIME') + + +# # resample the data, calculating the mean over hourly periods, starting on the half hour +# df1=df1.resample('H',base=0.5).mean() + +# df2=df2.resample('H',base=0.5).mean() + +# # reset the labels so they read the hour in the centre of the averaging period +# df1.index = df1.index + pd.Timedelta('30 min') + +# df2.index = df2.index + pd.Timedelta('30 min') + + +# # combine the two dataframes based on their time indicies, recording nan if one sensor doesn't have a reading for that timestamp +# total_df = pd.concat([df1,df2], join='outer', axis=1) + + + + + + + diff --git a/ocean_dp/processing/pressure_interpolator.py b/ocean_dp/processing/pressure_interpolator.py new file mode 100755 index 0000000..0473144 --- /dev/null +++ b/ocean_dp/processing/pressure_interpolator.py @@ -0,0 +1,267 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Tue Feb 4 11:05:16 2020 + +@author: tru050 +""" + +import re +from datetime import datetime, timedelta +from netCDF4 import num2date, date2num +from netCDF4 import stringtochar +import numpy.ma as ma +import numpy as np +import sys +from netCDF4 import Dataset +import numpy +import argparse +import glob +import pandas as pd +import scipy +import os +import shutil + +# Supply netCDFfiles as a ['list'] of files, agg as a 'string' + +def pressure_interpolator(netCDFfiles = [],agg = []): + + files_out = [] + + if netCDFfiles==[]: + + print('netcdffiles = none') + + # Load the filenames of the fv01 files in the current folder + netCDFfiles = glob.glob('*FV01*.nc') + + if agg == []: + + print('agg = none') + + # Extract the aggregate file data + agg = Dataset(glob.glob('*Aggregate*.nc')[0], mode="r") + + else: + + agg = Dataset(glob.glob(agg)[0], mode="r") + + # Loop through each of the fv01 files + for fn in netCDFfiles: + + print('File selected is '+fn) + + # Change the creation date in the filename to today + now=datetime.utcnow() + + fn_new_split = fn.split('_') + + fn_new_split[-1] = "C-" + now.strftime("%Y%m%d") + ".nc" + + fn_new_split[2] += 'IP' + + fn_new = '_'.join(fn_new_split) + + + # If a new (different) filename has been successfully generated, make + # a copy of the old file with the new name + if fn_new != fn: + + files_out.append(fn_new) + + print('copying file') + # copy file + shutil.copy(fn, fn_new) + + # Open and work in the new copy + fv01_contents = Dataset(fn_new,mode='a') + + print('copied file opened') + + # Check the current file doesn't contain pressure to run the following + # interpolator + if not 'PRES' in fv01_contents.variables: + + print("file doesn't contain pressure") + + print(fv01_contents.variables.keys()) + print(agg.variables.keys()) + + # Create a NaN array to fill with pressure values + interp_agg_pres = np.full((len(agg.variables["NOMINAL_DEPTH"])+1,len(fv01_contents.variables["TIME"])),np.nan) + + # Set the first row as zeros to set 0m as 0dbar + interp_agg_pres[0,:] = 0 + + # Create a new array representing the nominal depths of the agg file, + # including the 0m values + agg_nominal_depths = np.insert(np.array(agg.variables["NOMINAL_DEPTH"][:]),0,0) + + # For each nominal depth, interpolate the agg data at the fv01 times + for j in range(1,len(agg_nominal_depths)): + + time_selection = agg.variables["TIME"][agg.variables["instrument_index"][:]==(j-1)] + + pres_selection = agg.variables["PRES"][agg.variables["instrument_index"][:]==(j-1)] + + interp_agg_pres[j,:] = np.interp(fv01_contents.variables["TIME"][:],time_selection,pres_selection) + + # Sort the nominal depths and pressures according to nominal depth + interp_agg_pres = interp_agg_pres[np.argsort(agg_nominal_depths),:] + + agg_nominal_depths.sort() + + # If there are any NaN values, linearly interpolate profilewise + if np.isnan(np.sum(interp_agg_pres)): + + # Make a dataframe of the interpolated pressure to handle NaNs easily + interp_agg_pres_df = pd.DataFrame(data=interp_agg_pres,index=agg_nominal_depths) + + # Find all the columns where the lowest element is NaN + nan_cols = np.where(interp_agg_pres_df.iloc[-1].isna()) + + # Select each column containing an NaN as the deepest value + for j in nan_cols: + + # Find the shallowest nominal depth that isn't NaN + shallowest_val = pd.Series.last_valid_index(interp_agg_pres_df.iloc[:,j]) + + # Find the index of that nominal depth + shallowest_idx = interp_agg_pres_df.index.tolist().index(shallowest_val) + + # Starting at the shallowest NaN in a continous block of NaNs to the bottom + for k in range(shallowest_idx+1,len(interp_agg_pres_df)): + + # Linearly interpolate from shallow to deep, based on a nominal depth difference of 1m equating to 1dbar + interp_agg_pres_df.iloc[k,j] = interp_agg_pres_df.iloc[k-1,j]+np.diff(interp_agg_pres_df.index)[k-1] + + # Linearly interpolate any remaining NaNs + interp_agg_pres_df = interp_agg_pres_df.interpolate(method="index") + + # Convert the DataFrame back to an array + interp_agg_pres = interp_agg_pres_df.to_numpy() + + # Create a NaN array to receive the fv01 interpolated pressures + interp_fv01_pres = np.full((np.shape(fv01_contents.variables["TIME"][:])),np.nan) + + # At each timestamp, interpolate pressure for the fv01 data + for j in range(len(fv01_contents.variables["TIME"])): + + interp_fv01_pres[j] = np.interp(fv01_contents.variables["NOMINAL_DEPTH"][0],agg_nominal_depths,interp_agg_pres[:,j]) + + # Create the PRES and PRES_quality_control variables, and their attributes + + pres_var = fv01_contents.createVariable('PRES','f8',fv01_contents.variables['TIME'].dimensions,fill_value=99, zlib=True) + + pres_atts = ['long_name','sea_water_pressure_due_to_sea_water','units','dbar', + 'standard_name','coordinates','TIME LATITUDE LONGITUDE NOMINAL_DEPTH','sea_water_pressure_due_to_sea_water','valid_max', + 12000,'valid_min',-15] + + for att_name,att_value in zip(pres_atts[0::2],pres_atts[1::2]): + + pres_var.setncattr(att_name,att_value) + + pres_var[:] = interp_fv01_pres + + + pres_qc_var = fv01_contents.createVariable('PRES_quality_control','i1',fv01_contents.variables['TIME'].dimensions,fill_value=99, zlib=True) + + pres_qc_var.long_name = "quality_code for PRES" + + pres_qc_var.flag_values = np.array([0, 1, 2, 3, 4, 6, 7, 9]) + + pres_qc_var.flag_meanings = 'unknown good_data probably_good_data probably_bad_data bad_data not_deployed interpolated missing_value' + + pres_qc_var[:] = 7 + + pres_var.ancillary_variables = "PRES_quality_control" + + # Close the netcdf files + + fv01_contents.close() + + # Deal with files that already contain pressure, but contain NaNs + elif any(np.isnan(np.array(fv01_contents.variables['PRES'][:]))): + + print("file contains pressure and agg contains NaNs") + + # Create a NaN array to fill with pressure values + interp_agg_pres = np.full((len(agg.variables["NOMINAL_DEPTH"])+1,len(fv01_contents.variables["TIME"])),np.nan) + + # Set the first row as zeros to set 0m as 0dbar + interp_agg_pres[0,:] = 0 + + # Create a new array representing the nominal depths of the agg file, + # including the 0m values + agg_nominal_depths = np.insert(np.array(agg.variables["NOMINAL_DEPTH"][:]),0,0) + + # For each nominal depth, interpolate the agg data at the fv01 times + for j in range(1,len(agg_nominal_depths)): + + time_selection = agg.variables["TIME"][agg.variables["instrument_index"][:]==(j-1)] + + pres_selection = agg.variables["PRES"][agg.variables["instrument_index"][:]==(j-1)] + + interp_agg_pres[j,:] = np.interp(fv01_contents.variables["TIME"][:],time_selection,pres_selection) + + # Sort the nominal depths and pressures according to nominal depth + interp_agg_pres = interp_agg_pres[np.argsort(agg_nominal_depths),:] + + agg_nominal_depths.sort() + + # Make a dataframe of the interpolated pressure to handle NaNs easily + interp_agg_pres_df = pd.DataFrame(data=interp_agg_pres,index=agg_nominal_depths) + + # Find all the columns where the lowest element is NaN + nan_cols = np.where(interp_agg_pres_df.iloc[-1].isna()) + + # Select each column containing an NaN as the deepest value + for j in nan_cols: + + # Find the shallowest nominal depth that isn't NaN + shallowest_val = pd.Series.last_valid_index(interp_agg_pres_df.iloc[:,j]) + + # Find the index of that nominal depth + shallowest_idx = interp_agg_pres_df.index.tolist().index(shallowest_val) + + # Starting at the shallowest NaN in a continous block of NaNs to the bottom + for k in range(shallowest_idx+1,len(interp_agg_pres_df)): + + # Linearly interpolate from shallow to deep, based on a nominal depth difference of 1m equating to 1dbar + interp_agg_pres_df.iloc[k,j] = interp_agg_pres_df.iloc[k-1,j]+np.diff(interp_agg_pres_df.index)[k-1] + + # Linearly interpolate any remaining NaNs + interp_agg_pres_df = interp_agg_pres_df.interpolate(method="index") + + # Convert the DataFrame back to an array + interp_agg_pres = interp_agg_pres_df.to_numpy() + + # Create a NaN array to receive the fv01 interpolated pressures + interp_fv01_pres = np.full((np.shape(fv01_contents.variables["TIME"][:])),np.nan) + + # Extract the interpolated pressures (NaNs removed) to store in netCDF4 + interp_fv01_pres = interp_agg_pres_df[interp_agg_pres_df.index==fv01_contents.variables["NOMINAL_DEPTH"][:]] + + # Find indices where the netcdf data and interpolated data don't match (where the NaNs are in the netcdf) + nan_rep_idx = np.where(interp_fv01_pres!=fv01_contents.variables['PRES'][:])[1] + + # + fv01_contents.variables['PRES_quality_control'][nan_rep_idx] = 7 + + print('QC altered in original press') + + # Insert pressure value with NaNs interpolated back into netcdf + fv01_contents.variables['PRES'][:] = interp_fv01_pres + + print('press altered in orginal press') + + fv01_contents.close() + + agg.close() + + return files_out + + + + + diff --git a/ocean_dp/qc/add_qc_flags.py b/ocean_dp/qc/add_qc_flags.py index 7d00c63..1d1fa0e 100644 --- a/ocean_dp/qc/add_qc_flags.py +++ b/ocean_dp/qc/add_qc_flags.py @@ -18,11 +18,12 @@ from netCDF4 import Dataset, num2date import sys - +from datetime import datetime import numpy as np from dateutil import parser import pytz import os +import shutil # add QC variables to file @@ -32,8 +33,31 @@ def add_qc(netCDFfile): new_name = [] # list of new file names # loop over all file names given - for fn in netCDFfile[1:]: - ds = Dataset(fn, 'a') + for fn in netCDFfile: + + # rename the file FV00 to FV01 (imos specific) + fn_new = fn.replace("FV00", "FV01") + + # Change the creation date in the filename to today + now=datetime.utcnow() + + + + fn_new = "".join((fn_new[0:-11],now.strftime("%Y%m%d"),fn_new[-3::])) + + # Add the new file name to the list of new file names + new_name.append(fn_new) + + # If a new (different) filename has been successfully generated, make + # a copy of the old file with the new name + if fn_new != fn: + # copy file + shutil.copy(fn, fn_new) + + + print(fn_new) + + ds = Dataset(fn_new, 'a') # read the variable names from the netCDF dataset vars = ds.variables @@ -51,29 +75,33 @@ def add_qc(netCDFfile): if "TIME" in vars[v].dimensions: # print("time dim ", v) - ncVarOut = ds.createVariable(v+"_quality_control", "i1", vars[v].dimensions, fill_value=99, zlib=True) # fill_value=99 otherwise defaults to max, imos-toolbox uses 99 - ncVarOut[:] = np.zeros(vars[v].shape) - ncVarOut.long_name = "quality_code for " + v + if v+"_quality_control" not in ds.variables: + ncVarOut = ds.createVariable(v+"_quality_control", "i1", vars[v].dimensions, fill_value=99, zlib=True) # fill_value=99 otherwise defaults to max, imos-toolbox uses 99 + ncVarOut[:] = np.zeros(vars[v].shape) + ncVarOut.long_name = "quality_code for " + v + ncVarOut.flag_values = np.array([0, 1, 2, 3, 4, 6, 7, 9]) + ncVarOut.flag_meanings = 'unknown good_data probably_good_data probably_bad_data bad_data not_deployed interpolated missing_value' + - vars[v].ancillary_variables = v + "_quality_control" + vars[v].ancillary_variables = v + "_quality_control" - # update the file version attribute + # update the global attributes ds.file_version = "Level 1 - Quality Controlled Data" + + ds.date_created = now.strftime("%Y-%m-%dT%H:%M:%SZ") + + ds.history += ' ' + now.strftime("%Y%m%d:") + ' converted to FV01 file, quality_control variables added.' - ds.close() - - # rename the file FV00 to FV01 (imos specific) - fn_new = fn.replace("FV00", "FV01") - new_name.append(fn_new) + # ADD quality control attributes!! - if fn_new != fn: - # copy file - os.copy(fn, fn_new) + ds.close() - print(fn_new) return new_name if __name__ == "__main__": - add_qc(sys.argv) + add_qc(sys.argv[1:]) + + + \ No newline at end of file diff --git a/ocean_dp/qc/agg_temp_plot.py b/ocean_dp/qc/agg_temp_plot.py new file mode 100755 index 0000000..71ab427 --- /dev/null +++ b/ocean_dp/qc/agg_temp_plot.py @@ -0,0 +1,137 @@ +# Copyright (C) 2020 Ben Weeding +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +import numpy.ma as ma +import sys +from netCDF4 import Dataset, num2date +from dateutil import parser +import numpy as np +import argparse +import glob +import pytz +import os +import matplotlib.pyplot as plt +from matplotlib import colors +from matplotlib.ticker import PercentFormatter +from sigfig import round + + +x=Dataset('IMOS_ABOS-SOTS_COPSTIP_20180822_SOFS_FV02_SOFS-Aggregate-TEMP_END-20190322_C-20200311.nc',mode='r') + +temp = np.array(x.variables['TEMP'][:]) + +time = np.array(x.variables['TIME'][:]) + +ins_idx = np.array(x.variables['instrument_index'][:]) + +fig, ax = plt.subplots(6,5) + +ax=ax.flatten() + +label_coords = (0.1, 0.8) +label_method = 'axes fraction' + +# cmap = colors.ListedColormap(['black','green','blue','red','orange']) + +# boundaries = [0,5,10,20,40,80] + +cmap = colors.ListedColormap(['blue','orange','red']) + +boundaries = [0,20,30,500] + +norm = colors.BoundaryNorm(boundaries, cmap.N, clip=True) + + +for i in set(np.array(ins_idx)): + + cur_temp = temp[ins_idx==i] + + cur_time = time[ins_idx==i] + + cur_time_hr = cur_time*24 + + # Calculate time changes + cur_time_hr_diffs = np.diff(cur_time_hr) + + cur_temp_diffs = np.diff(cur_temp) + + # Calculate the rate of change of temperature wrt time + cur_dtemp_dtime = np.divide(cur_temp_diffs,cur_time_hr_diffs) + + im = ax[i].scatter(cur_time,cur_temp,s=1,c=np.concatenate((np.array([0]),np.abs(cur_dtemp_dtime))),cmap=cmap,norm=norm) + + #ax[i].set_title(,fontsize=10) + + ax[i].annotate('Ins:'+str(i),xy=label_coords, xycoords=label_method,fontsize=8) + + if i==27: + fig.colorbar(im) + + +fig.colorbar(cmap) + +i=1 +fig, ax = plt.subplots() +ax.plot(time[ins_idx==i],temp[ins_idx==i]) + + +# Remove bad instruments +good_vals = [a!=14 and a!=15 for a in ins_idx] + +fig, ax = plt.subplots() +ax.hist(temp[good_vals],21) + +sofs75_temp_diffs = np.array([]) + +good_ins = set(np.array(ins_idx)) + +good_ins -= {14,15} + +for i in good_ins: + + cur_temp = temp[ins_idx==i] + + cur_time = time[ins_idx==i] + + cur_time_hr = cur_time*24 + + # Calculate time changes + cur_time_hr_diffs = np.diff(cur_time_hr) + + cur_temp_diffs = np.diff(cur_temp) + + # Calculate the rate of change of temperature wrt time + cur_dtemp_dtime = np.divide(cur_temp_diffs,cur_time_hr_diffs) + + print('ins '+str(i)+':'+str(np.max(cur_dtemp_dtime))) + + sofs75_temp_diffs = np.concatenate((sofs75_temp_diffs,cur_dtemp_dtime)) + + + + + + + + + + + + + + + + + diff --git a/ocean_dp/qc/arg_tester.py b/ocean_dp/qc/arg_tester.py new file mode 100755 index 0000000..2bfe7b4 --- /dev/null +++ b/ocean_dp/qc/arg_tester.py @@ -0,0 +1,38 @@ +# Copyright (C) 2020 Ben Weeding +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +import re +from datetime import datetime, timedelta +from netCDF4 import num2date, date2num +from netCDF4 import stringtochar +import numpy.ma as ma +import sys +from netCDF4 import Dataset +import numpy as np +import argparse +import glob +import pytz +import os + +def learn_arg_sys(text_in): + + print(text_in) + +# USE runfile('arg_tester.py', args='bing') + + +#if __name__ == "__main__": + # usage is <*args> +learn_arg_sys(text_in=sys.argv[1]) \ No newline at end of file diff --git a/ocean_dp/qc/flatline_test.py b/ocean_dp/qc/flatline_test.py new file mode 100755 index 0000000..79ab981 --- /dev/null +++ b/ocean_dp/qc/flatline_test.py @@ -0,0 +1,135 @@ +# Copyright (C) 2020 Ben Weeding +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +import re +from datetime import datetime, timedelta +from netCDF4 import num2date, date2num +from netCDF4 import stringtochar +import numpy.ma as ma +import sys +from netCDF4 import Dataset +import numpy as np +import argparse +import glob +import pytz +import os + + +# If files aren't specified, take all the IMOS*.nc files in the current folder +def flatline_test_all_files(target_vars_in=[], window=5, flag=4): + target_files = glob.glob('IMOS*.nc') + + flatline_test_files(target_files, target_vars_in=target_vars_in, window=window, flag=flag) + + +def flatline_test_files(target_files, target_vars_in=[], window=5, flag=4): + + # Loop through each files in target_files + for current_file in target_files: + # Print each filename + print("input file %s" % current_file) + + # Extract netcdf data into nc + nc = Dataset(current_file, mode="a") + + # run the flat line test + flatline_test(nc=nc, target_vars_in=target_vars_in, window=window, flag=flag) + + +def flatline_test(nc, target_vars_in=[], window=5, flag=4): + + print('Window is '+str(window)) + + # If target_vars aren't user specified, set it to all the variables of + # the current_file, removing unwanted variables + if target_vars_in == []: + + target_vars = list(nc.variables.keys()) + + # Remove TIME + target_vars.remove('TIME') + + # Remove any quality_control variables + qc_vars = [s for s in target_vars if 'quality_control' in s] + target_vars = [s for s in target_vars if s not in qc_vars] + + # Remove any variables of single length + single_vars = [s for s in target_vars if nc.variables[s].size==1] + target_vars = [s for s in target_vars if s not in single_vars] + + print('target_vars are '+' '.join(target_vars)) + + else: + target_vars = target_vars_in + + # For each variable, extract the data + for current_var in target_vars: + + # Extract the variable + nc_var = nc.variables[current_var] + + if nc_var.name + "_quality_control_flt" in nc.variables: + print('flt qc variable already present') + ncVarOut = nc.variables[nc_var.name + "_quality_control_flt"] + ncVarOut[:] = 0 + else: + ncVarOut = nc.createVariable(nc_var.name + "_quality_control_flt", "i1", nc_var.dimensions, fill_value=99, zlib=True) # fill_value=0 otherwise defaults to max + ncVarOut[:] = 0 + # print(all(nc.variables[nc_var.name + "_quality_control_flt"]==0)) + ncVarOut.long_name = "quality flag for " + nc_var.name + ncVarOut.flag_values = np.array([0, 1, 2, 3, 4, 6, 7, 9], dtype=np.int8) + ncVarOut.flag_meanings = 'unknown good_data probably_good_data probably_bad_data bad_data not_deployed interpolated missing_value' + + # add new variable to list of aux variables + nc_var.ancillary_variables = nc_var.ancillary_variables + " " + nc_var.name + "_quality_control_flt" + + var_data = np.array(nc.variables[current_var][:]) + + if (all(nc.variables[nc_var.name + "_quality_control_flt"][:] == 0)): + print('All test specific qc values are zero before filling') + + print('checking ' + current_var) + + print('Window is ' + str(window)) + + # Step through the data, one element at a time, using the window + for i in range(0, (len(var_data) - window + 1)): + + # This is true if 'window' elements in a row are equal + if len(set(var_data[i:(i + window)])) == 1: + print(str(i)) + # set corresponding QC value to... + ncVarOut[i:(i + window)] = flag + + points_marked = len([elem for elem in ncVarOut[:] if elem == 4]) + print('Data points flagged: ', points_marked) + + qc_var = nc.variables[current_var + "_quality_control"] + qc_var[:] = np.maximum(ncVarOut[:], qc_var[:]) + # update the history attribute + try: + hist = nc.history + "\n" + except AttributeError: + hist = "" + + + + nc.setncattr('history', hist + datetime.utcnow().strftime("%Y-%m-%d") + 'flatline_test performed on [' + str(target_vars) + '], window '+str(window)+' consecutive values or more were flagged with '+str(flag) ) + + nc.close() + +if __name__ == "__main__": + # usage is + flatline_test_files(target_files=[sys.argv[1]], target_vars_in=[sys.argv[2]], window=float(sys.argv[3]), flag=float(sys.argv[4])) diff --git a/ocean_dp/qc/global_range.py b/ocean_dp/qc/global_range.py index e11bf40..db3b80b 100644 --- a/ocean_dp/qc/global_range.py +++ b/ocean_dp/qc/global_range.py @@ -28,29 +28,72 @@ # flag 4 (bad) when out of global range -def global_range(netCDFfile, variable, max, min): +def global_range(netCDFfile, variable, max, min, qc_value=4): ds = Dataset(netCDFfile, 'a') - var = ds.variables[variable] + nc_var = ds.variables[variable] + var_data = nc_var[:] + var_data.mask = False try: - var_qc = ds.variables[variable + "_quality_control"] + # find the existing quality_control variable in the auxillary variables list + aux_vars = nc_var.ancillary_variables + aux_var = aux_vars.split(" ") + qc_vars = [i for i in aux_var if i.endswith("_quality_control")] + qc_var = qc_vars[0] + print("QC var name ", qc_var) + var_qc = ds.variables[qc_var] except KeyError: print("no QC variable found") return None + # read existing quality_control flags + qc = var_qc[:] + # this is where the actual QC test is done - mask = ((var[:] > max) | (var[:] < min)) + mask = ((var_data > max) | (var_data < min)) + print('mask data ', mask) + + # create a qc variable just for this test flags + if nc_var.name + "_quality_control_gr" in ds.variables: + ncVarOut = ds.variables[nc_var.name + "_quality_control_gr"] + ncVarOut[:] = 0 + else: + ncVarOut = ds.createVariable(nc_var.name + "_quality_control_gr", "i1", nc_var.dimensions, fill_value=99, zlib=True) # fill_value=0 otherwise defaults to max + ncVarOut[:] = 0 + ncVarOut.long_name = "quality flag for " + nc_var.name + ncVarOut.flag_values = np.array([0, 1, 2, 3, 4, 6, 7, 9], dtype=np.int8) + ncVarOut.flag_meanings = 'unknown good_data probably_good_data probably_bad_data bad_data not_deployed interpolated missing_value' + + # add new variable to list of aux variables + nc_var.ancillary_variables = nc_var.ancillary_variables + " " + nc_var.name + "_quality_control_gr" + + # store the qc flags + ncVarOut[mask] = qc_value - mask = mask & (var_qc[:] < 1) # only mark data that has not been QCd already + # store qc flags to main quality_control flags variable + mask = mask & (qc < 1) # only mark data that has not been QCd already + print('mask other qc ', mask) - var_qc[mask] = 4 - count = sum(mask) - print('marked records ', count) + qc[mask] = qc_value # mark the out of range points with bad_data + + # calculate the number of points marked as bad_data + marked = np.zeros_like(qc) + marked[mask] = 1 + count = sum(marked) + print('marked records ', count, mask, qc) + + # write flags back to main QC variable + var_qc[:] = qc # update the history attribute - history = ds.history - ds.setncattr("history", history + "\n" + datetime.utcnow().strftime("%Y-%m-%d") + " " + variable + " global range min = " + str(min) + " max = " + str(max) + " marked " + str(count)) + try: + hist = ds.history + "\n" + except AttributeError: + hist = "" + ds.setncattr("history", hist + datetime.utcnow().strftime("%Y-%m-%d") + " " + variable + " global range min = " + str(min) + " max = " + str(max) + " marked " + str(count)) + + ds.variables[variable + "_quality_control"][:] = np.maximum(ds.variables[variable + "_quality_control_gr"][:],ds.variables[variable + "_quality_control"][:]) ds.close() @@ -59,5 +102,8 @@ def global_range(netCDFfile, variable, max, min): if __name__ == "__main__": - # usage is - global_range(sys.argv[1], sys.argv[2], float(sys.argv[3]), float(sys.argv[4])) + # usage is + if len(sys.argv) > 5: + global_range(sys.argv[1], sys.argv[2], max=float(sys.argv[3]), min=float(sys.argv[4]), qc_value=int(sys.argv[5])) + else: + global_range(sys.argv[1], sys.argv[2], max=float(sys.argv[3]), min=float(sys.argv[4])) \ No newline at end of file diff --git a/ocean_dp/qc/in_out_water.py b/ocean_dp/qc/in_out_water.py index ac14840..7ec3e4d 100644 --- a/ocean_dp/qc/in_out_water.py +++ b/ocean_dp/qc/in_out_water.py @@ -18,53 +18,104 @@ from netCDF4 import Dataset, num2date import sys - +from datetime import datetime import numpy as np from dateutil import parser import pytz import os -# flag out of water as QC value 7 (not_deployed), with wise leave as 0 +# flag out of water as QC value 6 (not_deployed), with wise leave as 0 -def in_out_water(netCDFfile): - ds = Dataset(netCDFfile, 'a') +def in_out_water(fn, var_name=None): - vars = ds.variables + out_file = [] - to_add = [] - for v in vars: - #print (vars[v].dimensions) - if v != 'TIME': - to_add.append(v) - time_var = vars["TIME"] + ds = Dataset(fn, 'a') + + nc_vars = ds.variables + to_add = [] + if var_name: + to_add.append(var_name) + else: + for v in nc_vars: + if "TIME" in nc_vars[v].dimensions: + #print (vars[v].dimensions) + if v != 'TIME': + to_add.append(v) + # remove any anx variables from the list + for v in nc_vars: + if 'ancillary_variables' in nc_vars[v].ncattrs(): + remove = nc_vars[v].getncattr('ancillary_variables').split(' ') + print("remove ", remove) + for r in remove: + to_add.remove(r) + + time_var = nc_vars["TIME"] time = num2date(time_var[:], units=time_var.units, calendar=time_var.calendar) time_deploy = parser.parse(ds.time_deployment_start, ignoretz=True) time_recovery = parser.parse(ds.time_deployment_end, ignoretz=True) - print(time_deploy) - - print(to_add) - for v in to_add: - if "TIME" in vars[v].dimensions: - - if v.endswith("_quality_control"): + print('file', fn) + print('deployment time', time_deploy) - print("QC time dim ", v) + print('var to add', to_add) - ncVarOut = vars[v] - mask = (time <= time_deploy) | (time >= time_recovery) - ncVarOut[mask] = np.ones(vars[v].shape)[mask] * 7 + # create a mask for the time range + mask = (time <= time_deploy) | (time >= time_recovery) + for v in to_add: + print("var", v, ' dimensions ', nc_vars[v].dimensions) + + ncVarOut = nc_vars[v + "_quality_control"] + ncVarOut[mask] = 6 + + # create a qc variable just for this test flags + if v + "_quality_control_io" in ds.variables: + ncVarOut = ds.variables[v + "_quality_control_io"] + ncVarOut[:] = 0 + else: + ncVarOut = ds.createVariable(v + "_quality_control_io", "i1", nc_vars[v].dimensions, fill_value=99, zlib=True) # fill_value=0 otherwise defaults to max + nc_vars[v].ancillary_variables = nc_vars[v].ancillary_variables + " " + v + "_quality_control_io" + + ncVarOut[:] = 0 + ncVarOut.long_name = "quality flag for " + nc_vars[v].long_name + try: + ncVarOut.standard_name = nc_vars[v].standard_name + " status_flag" + except AttributeError: + pass + + ncVarOut.quality_control_conventions = "IMOS standard flags" + ncVarOut.flag_values = np.array([0, 1, 2, 3, 4, 6, 7, 9], dtype=np.int8) + ncVarOut.flag_meanings = 'unknown good_data probably_good_data probably_bad_data bad_data not_deployed interpolated missing_value' + ncVarOut.comment = 'data flagged not deployed (6) when out of water' + + ncVarOut[mask] = 6 + # calculate the number of points marked as bad_data + marked = np.zeros_like(ncVarOut) + marked[mask] = 1 + count = sum(marked) ds.file_version = "Level 1 - Quality Controlled Data" + # update the history attribute + try: + hist = ds.history + "\n" + except AttributeError: + hist = "" + + ds.setncattr('history', hist + datetime.utcnow().strftime("%Y-%m-%d") + ' : ' + ' marked ' + str(int(count))) ds.close() - return netCDFfile + out_file.append(fn) + + return out_file if __name__ == "__main__": - in_out_water(sys.argv[1]) + if len(sys.argv) > 2 & sys.argv[1].startswith('-'): + in_out_water(sys.argv[2:], var_name=sys.argv[1][1:]) + else: + in_out_water(sys.argv[1:]) \ No newline at end of file diff --git a/ocean_dp/qc/netcdf_gen.py b/ocean_dp/qc/netcdf_gen.py new file mode 100755 index 0000000..1b56de8 --- /dev/null +++ b/ocean_dp/qc/netcdf_gen.py @@ -0,0 +1,163 @@ + # Copyright (C) 2020 Ben Weeding + # + # This program is free software: you can redistribute it and/or modify + # it under the terms of the GNU General Public License as published by + # the Free Software Foundation, either version 3 of the License, or + # (at your option) any later version. + # + # This program is distributed in the hope that it will be useful, + # but WITHOUT ANY WARRANTY; without even the implied warranty of + # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + # GNU General Public License for more details. + # + # You should have received a copy of the GNU General Public License + # along with this program. If not, see . + +from netCDF4 import Dataset, date2num +import sys +from datetime import datetime +import numpy as np +import shutil + + # Provide the function with a filename (don't include .nc), a nominal depth, +# and pairs of names and arrays containing the data to be included as variables. +# A time dimension/variable is created by default, starting at 01/01/2020 using +# 1 hour timestamps + + # For example, netcdf_gen('test',30,'PRES',pres_data,'TEMP',temp_data) +# from the command line gen_test_data.py test 30 PRES 10,20,30 TEMP 11,12,NaN + + + +def netcdf_gen(file_name, nominal_depth, *args): + # Convert the args tuple to a list + args = list(args) + #print(args, type(args[1])) + + file_name = "IMOS_" + file_name + ".nc" # if we insist on not wanting to pass these + + # deal with passing nominal depth as a string + if isinstance(nominal_depth, str): + nominal_depth = float(nominal_depth) + print('nominal depth :', nominal_depth) + + # Check the args are paired + if len(args) % 2 == 0: + + # Assign the names and data to lists + var_names = args[0::2] + + # deal with passing data as a string list of values + if isinstance(args[1], str): + var_data = [[float(b) for b in a.split(',')] for a in args[1::2]] + #print('var_data split', var_data) + else: + var_data = args[1::2] + + # Check if first of each pair is a string + if all(isinstance(x, str) for x in var_names): + + # Check if second of each pair are all equal in shape + if all(np.shape(var_data[0]) == np.shape(x) for x in var_data): + + # Create the netcdf with IMOS tag + ds = Dataset(file_name, "w", format="NETCDF4") + + # Create time dimension with length to match data + time_dim = ds.createDimension("TIME", len(var_data[0])) + + time_var = ds.createVariable("TIME", "f8", ("TIME")) + + time_var.setncattr('long_name', 'time') + time_var.setncattr('standard_name', 'time') + time_var.setncattr('units', 'days since 1950-01-01 00:00:00 UTC') + time_var.setncattr('calendar', 'gregorian') + time_var.setncattr('axis', 'T') + time_var.setncattr('valid_max', 90000) + time_var.setncattr('valid_min', 0) + + t0 = date2num(datetime(2020, 1, 1), units=time_var.units) + ds.variables['TIME'][:] = np.linspace(t0,t0 + (1 / 24) * (len(var_data[0])-1),num=len(var_data[0])) + + # Create the nominal depth variable + nom_depth_var = ds.createVariable("NOMINAL_DEPTH", "f8") + nom_depth_var.setncattr('long_name', 'nominal depth') + nom_depth_var.setncattr('units', 'dbar') + nom_depth_var.setncattr('positive', 'down') + nom_depth_var.setncattr('axis', 'Z') + nom_depth_var.setncattr('valid_max', 12000) + nom_depth_var.setncattr('valid_min', -5) + nom_depth_var.setncattr('reference_datum', 'sea surface') + + ds.variables["NOMINAL_DEPTH"][:] = nominal_depth + + # Create variables from input data + for name_in, data_in in zip(var_names, var_data): + ds.createVariable(name_in, "f8", ("TIME")) + ds.variables[name_in][:] = data_in + + + # read the variable names from the netCDF dataset + vars = ds.variables + + # create a list of variables, don't include the 'TIME' variable + # TODO: detect 'TIME' variable using the standard name 'time' + to_add = [] + + for v in vars: + #print (vars[v].dimensions) + if v != 'TIME': + to_add.append(v) + + # for each variable, add a new ancillary variable _quality_control to each which has 'TIME' as a dimension + for v in to_add: + if "TIME" in vars[v].dimensions: + # print("time dim ", v) + + if v+"_quality_control" not in ds.variables: + ncVarOut = ds.createVariable(v+"_quality_control", "i1", vars[v].dimensions, fill_value=99, zlib=True) # fill_value=99 otherwise defaults to max, imos-toolbox uses 99 + ncVarOut[:] = np.zeros(vars[v].shape) + ncVarOut.long_name = "quality_code for " + v + ncVarOut.flag_values = np.array([0, 1, 2, 3, 4, 6, 7, 9]) + ncVarOut.flag_meanings = 'unknown good_data probably_good_data probably_bad_data bad_data not_deployed interpolated missing_value' + + + vars[v].ancillary_variables = v + "_quality_control" + + # update the global attributes + ds.file_version = "Level 1 - Quality Controlled Data" + + ds.history = datetime.utcnow().strftime("%Y%m%d:") + ' converted to FV01 file, quality_control variables added.' + + # ADD quality control attributes!! + + ds.close() + + #add_qc(file_name) + + print("generated ", file_name) + + return (file_name) + + + + else: + print('Data arrays not of equal length') + + + else: + print('Labels not in string format') + + else: + print('Data not passed in pairs') + + +if __name__ == "__main__": + netcdf_gen(sys.argv[1], sys.argv[2], *sys.argv[3:]) + + + + + + + \ No newline at end of file diff --git a/ocean_dp/qc/psal_stat_plot.py b/ocean_dp/qc/psal_stat_plot.py new file mode 100755 index 0000000..cb9cbc7 --- /dev/null +++ b/ocean_dp/qc/psal_stat_plot.py @@ -0,0 +1,357 @@ +# Copyright (C) 2020 Ben Weeding +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + + +import numpy.ma as ma +import sys +from netCDF4 import Dataset, num2date +from dateutil import parser +import numpy as np +import argparse +import glob +import pytz +import os +import matplotlib.pyplot as plt +from matplotlib import colors +from matplotlib.ticker import PercentFormatter +from sigfig import round +import pandas as pd + +############################# Data extraction ################################ + +# creates an empty array to store the names of the SOTS deployments +deployments = [] + +checked_files = [] + +processed_files = [] + +# loops through all the folders and files contained in the folder +for x in os.listdir("/Users/tru050/Desktop/cloudstor/Shared/SOTS-Temp-Raw-Data"): + + # if the folder/file name contains 'Pulse' or 'SOFS' and doesn't contain '.', append it to deployments + if (('Pulse' in x) or ('SOFS' in x)) and ('.p' not in x): + + deployments.append(x) + + +# create a dataframe to store extract information +sots_psal_ensemble = pd.DataFrame(columns = ["dPsal/dtime","dPsal/dSample","QC","Nominal depth","Deployment"]) + + +# loops through all files in the directory +for root, dirs, files in os.walk("/Users/tru050/Desktop/cloudstor/Shared/SOTS-Temp-Raw-Data"): + + for fname in files: + + # append the filename to the list of checked files + checked_files.append(fname) + + # for each netcdf file labelled as FV01 and containing a deployment in its name + if fname.endswith('.nc') and 'FV01' in fname and any(ele in fname for ele in deployments): + + # print the filename + print(fname) + + # open the file + nc = Dataset(os.path.join(root,fname), mode = 'a') + + # check file contains psalerature data + if 'PSAL' in list(nc.variables): + + # check that the in_out_water test has been run on the file, if not run in_out_water code + if not 'PSAL_quality_control_io' in list(nc.variables): + + # run in_out_water script - uncommented at this point as just copied and pasted + var_name = 'PSAL' + nc_vars = nc.variables + to_add = [] + if var_name: + to_add.append(var_name) + else: + for v in nc_vars: + #print (vars[v].dimensions) + if v != 'TIME': + to_add.append(v) + + time_var = nc_vars["TIME"] + time = num2date(time_var[:], units=time_var.units, calendar=time_var.calendar) + + time_deploy = parser.parse(nc.time_deployment_start, ignoretz=True) + time_recovery = parser.parse(nc.time_deployment_end, ignoretz=True) + + print('deployment time', time_deploy) + + print(to_add) + + # create a mask for the time range + mask = (time <= time_deploy) | (time >= time_recovery) + + for v in to_add: + if "TIME" in nc_vars[v].dimensions: + if v.endswith("_quality_control"): + print("QC time dim ", v) + + ncVarOut = nc_vars[v] + ncVarOut[mask] = 7 + else: + # create a qc variable just for this test flags + if v + "_quality_control_io" in nc.variables: + ncVarOut = nc.variables[v + "_quality_control_io"] + else: + ncVarOut = nc.createVariable(v + "_quality_control_io", "i1", nc_vars[v].dimensions, fill_value=99, zlib=True) # fill_value=0 otherwise defaults to max + ncVarOut[:] = np.zeros(nc_vars[v].shape) + ncVarOut.long_name = "quality flag for " + v + ncVarOut.flag_values = np.array([0, 1, 2, 3, 4, 6, 7, 9], dtype=np.int8) + ncVarOut.flag_meanings = 'unknown good_data probably_good_data probably_bad_data bad_data not_deployed interpolated missing_value' + + nc_vars[v].ancillary_variables = nc_vars[v].ancillary_variables + " " + v + "_quality_control_io" + ncVarOut[mask] = 7 + + nc.variables[v + "_quality_control"][:] = np.maximum(nc.variables[v + "_quality_control_io"][:],nc.variables[v + "_quality_control"][:]) + + nc.file_version = "Level 1 - Quality Controlled Data" + + + + # check that the file has a single dimension psalerature vector, and that the time format is correct + if np.array(nc.variables['PSAL'][:]).ndim == 1 and nc.variables['TIME'].getncattr('units') =='days since 1950-01-01 00:00:00 UTC': + + # calculate psalerature changes for in water data + nc_psal_diffs = np.diff(np.array(nc.variables['PSAL'][np.array(nc.variables['PSAL_quality_control'][:])!=7])) + + # extract the time data + nc_time = np.array(nc.variables['TIME'][np.array(nc.variables['PSAL_quality_control'][:])!=7]) + + # Convert from days to hours + nc_time_hr = nc_time*24 + + # Calculate time changes in hours + nc_time_hr_diffs = np.diff(nc_time_hr) + + # calculate the rate of change of psalerature wrt time (degrees °C per hour) + nc_dpsal_dtime = np.divide(nc_psal_diffs,nc_time_hr_diffs) + + + + # extract psal_qc data + nc_psal_qc = np.array(nc.variables['PSAL_quality_control'][np.array(nc.variables['PSAL_quality_control'][:])!=7]) + + # calculate qc values for each nc_dpsal_dtime by taking the maximum of the qc values of the two contributing psals + nc_dpsal_dtime_qc = pd.Series(nc_psal_qc).rolling(2).max().dropna().to_numpy() + + + + # extract sensor nominal depth + nc_nom_depth = np.array(nc.variables['NOMINAL_DEPTH']) + + # create a vector the same size as nc_dpsal_dtime with the nominal depth + nc_nom_depth_vector = np.repeat(nc_nom_depth,len(nc_dpsal_dtime)) + + + + # extract deployment name + nc_deployment = nc.deployment_code + + # create a list the same size as nc_dpsal_dtime with the deployment name + nc_deployment_list = [nc_deployment] * len(nc_dpsal_dtime) + + + + # combine information into an length x 4 dataframe + nc_psal_ensemble = pd.DataFrame({"dPsal/dtime":nc_dpsal_dtime,"dPsal/dSample":nc_psal_diffs,"QC":nc_dpsal_dtime_qc,"Nominal depth":nc_nom_depth_vector,"Deployment":nc_deployment_list}) + + # append the current netcdf's dataframe to the sots_psal_ensemble + sots_psal_ensemble = sots_psal_ensemble.append(nc_psal_ensemble) + + # append the filename to the list of processed files + processed_files.append(fname) + + + nc.close() + + +############################# Data processing ################################ + +# creates a new dataframe containing only data with QC < 3 +sots_psal_ensemble_qc210 = sots_psal_ensemble[sots_psal_ensemble["QC"]<3] + +# calculates overall standard deviation +std_total = np.std(sots_psal_ensemble_qc210["dPsal/dtime"]) + + + + +# creates an emply list to store data deployment by deployment +std_by_deployment_data = [] + +# creates a dict of deployment names and standard deviations +for i in sots_psal_ensemble_qc210.Deployment.unique(): + std_by_deployment_data.append( + { + 'Deployment': i, + 'STD': np.std(sots_psal_ensemble_qc210["dPsal/dtime"][sots_psal_ensemble_qc210["Deployment"]==i]), + 'STD sample': np.std(sots_psal_ensemble_qc210["dPsal/dSample"][sots_psal_ensemble_qc210["Deployment"]==i]) + } + ) + +# creates a Dataframe from the dict +std_by_deployment = pd.DataFrame(std_by_deployment_data) + + + + + +# ============================================================================= +# std_by_depth: this function takes two compulsary arguments (top: the shallowest +# depth(m)), bottom: the deepest depth(m)) and one option argument (deployment_in: +# the deployment from which data will be taken). The function will return the standard +# deviation of the d(psal)/d(Time) data from sensors with nominal depths at and +# between the two depths, and from only the deployment_in if specified. +# +# sample call: std_by_depth(500,10000,'SOFS-7.5-2018') +# +# this will give the std of all d(psal)/d(Time) data from SOFS-7.5-2018 from +# sensors with 500m <= nominal depth <= 10000m +# ============================================================================= + +def std_by_depth_psal(top,bottom,deployment_in=None,rate='time'): + + selection = '' + + if rate == 'time': + + selection = "dPsal/dtime" + + elif rate == 'sample': + + selection = "dPsal/dSample" + + else: + + return "incorrect rate specification" + + + + # if user incorrectly inputs depths, swap them and run the code + if top > bottom: + + top, bottom = bottom, top + + if deployment_in == None: + + # subsamples sots_psal_ensemble_qc210 based on depth + target_ensemble = sots_psal_ensemble_qc210[(sots_psal_ensemble_qc210["Nominal depth"]>=top) & (sots_psal_ensemble_qc210["Nominal depth"]<=bottom)] + + elif isinstance(deployment_in, list): + + # subsamples sots_psal_ensemble_qc210 based on depth + target_ensemble = sots_psal_ensemble_qc210[(sots_psal_ensemble_qc210["Nominal depth"]>=top) & (sots_psal_ensemble_qc210["Nominal depth"]<=bottom) & (sots_psal_ensemble_qc210.Deployment.isin(deployment_in))] + + + else: + + # subsamples sots_psal_ensemble_qc210 based on depth + target_ensemble = sots_psal_ensemble_qc210[(sots_psal_ensemble_qc210["Nominal depth"]>=top) & (sots_psal_ensemble_qc210["Nominal depth"]<=bottom) & (sots_psal_ensemble_qc210["Deployment"]==deployment_in)] + + # if not data is available with the given choices, end the function + if len(target_ensemble)==0: + + return 'No data available for those choices' + + + # calculates the mean of the subsample + target_mean = np.mean(target_ensemble[selection]) + + # calculates the standard deviation of the subsample + target_std = np.std(target_ensemble[selection]) + + # sets line thickness for plot + line_thick = 1 + + # creates axes for histogram + ax_hist=plt.axes() + + # plots a histogram of the data selected + target_ensemble.hist(column=selection,bins=100,log=True,ax=ax_hist) + + # draws lines at the mean +- 3 STD on the histogram + ax_hist.axvline(x=target_mean+3*target_std,color='r',linewidth=line_thick) + + ax_hist.axvline(x=target_mean-3*target_std,color='r',linewidth=line_thick) + + # sets the x label + if rate == 'time': + + ax_hist.set_xlabel('PSU/hr') + + elif rate == 'sample': + + ax_hist.set_xlabel('PSU/sample') + + + label_coords = (0.70, 0.99) + label_method = 'axes fraction' + + anno = 'mean = '+str(round(float(target_mean),sigfigs=3)) + + anno += '\n3 STD = ' + str(round(float(3*target_std),sigfigs=3)) + + anno += '\nno. samples = ' + str(len(target_ensemble)) + + anno += '\n'+str(top)+'m <= depth <= '+str(bottom)+'m' + + if deployment_in == None: + + anno += '\nall available data' + + elif isinstance(deployment_in, list): + + anno += '\n' + + anno += '\n'.join(deployment_in) + + else: + + anno += '\n'+deployment_in + + ax_hist.annotate(anno,xy=label_coords, xycoords=label_method,fontsize=8,va = "top", ha="left") + + # returns the standard deviation of the subsample + return target_std + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/ocean_dp/qc/qc_checker.py b/ocean_dp/qc/qc_checker.py new file mode 100755 index 0000000..6fd7881 --- /dev/null +++ b/ocean_dp/qc/qc_checker.py @@ -0,0 +1,137 @@ +# Copyright (C) 2020 Ben Weeding +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + + +import re +from datetime import datetime, timedelta +from netCDF4 import num2date, date2num +from netCDF4 import stringtochar +import numpy.ma as ma +import sys +from netCDF4 import Dataset +import numpy as np +import argparse +import glob +import pytz +import os + +# Code is design to check that netcdf files processed using the SOTS methods +# conform to the QC labelling designed by Peter Jansen. + +def qc_checker_all_files(target_vars_in=[]): + + successful_files=[] + + target_files = glob.glob('IMOS*.nc') + + # Returns the files that conform to the qc labelling + successful_files = qc_checker_files(target_files, target_vars_in=target_vars_in) + + +def qc_checker_files(target_files,target_vars_in=[]): + + successful_files=[] + + # Loop through each files in target_files + for current_file in target_files: + # Print each filename + print("input file %s" % current_file) + + # Extract netcdf data into nc + nc = Dataset(current_file, mode="a") + + # If the qc_checker was successfull, add the filename to the list + if qc_checker(nc,target_vars_in=target_vars_in): + + successful_files.append(current_file) + + # Returns the files that conform to the qc labelling + return successful_files + + +# Enter args as variable name and rate of change limit, ie. 'TEMP',4 +def qc_checker(nc,target_vars_in=[]): + + # Collect all the variables from the netcdf + all_vars = list(nc.variables.keys()) + + # If target_vars aren't user specified, set it to all the variables of + # the current_file, removing unwanted variables (qc, single length, TIME) + if target_vars_in == []: + + target_vars = [s for s in all_vars if 'TIME' not in s and 'quality_control' not in s and nc.variables[s].size!=1] + + print('target_vars are '+' '.join(target_vars)) + + else: + target_vars = target_vars_in + + # For each of the variables selected + for current_var in target_vars: + + # Collect the global qc data + qc_global_data = np.array(nc.variables[current_var+"_quality_control"][:]) + + # Collect the names of all the other test specific qc vectors + qc_test_specific = [s for s in all_vars if current_var+"_quality_control" in s and not s.endswith('control')] + + # For each of the other test specific qc vectors + for current_qc_test in qc_test_specific: + + # Extract the data + qc_test_data = np.array(nc.variables[current_qc_test]) + + #print('checking '+current_qc_test) + + # If any of the test specific qc vectors ever have a great value than the global qc vector at a timestamp, the qc process has failed + if any(np.less(qc_global_data,qc_test_data)): + + print(current_qc_test + "failed") + + qc_behaving = False + + else: + + # The qc process has succeeded + qc_behaving = True + + # sets all data with a qc value of 0 to have a qc value of 1, having passed all the tests + nc.variables[current_var+"_quality_control"][qc_global_data==0] = 1 + + now=datetime.utcnow() + + nc.history += ' ' + now.strftime("%Y%m%d:") + 'passed qc_checker, all qc=0 set to qc=1' + + # Returns true if qc has succeeded, false if not + return qc_behaving + + + # Close the current netcdf file + nc.close() + + +# def qc_check_plot(target_file,target_var) + +# nc = Dataset(target_file,'r') + + + + + + + + + + \ No newline at end of file diff --git a/ocean_dp/qc/rate_of_change_test.py b/ocean_dp/qc/rate_of_change_test.py new file mode 100755 index 0000000..84f8e4f --- /dev/null +++ b/ocean_dp/qc/rate_of_change_test.py @@ -0,0 +1,249 @@ +# Copyright (C) 2020 Ben Weeding +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +import re +from datetime import datetime, timedelta +from netCDF4 import num2date, date2num +from netCDF4 import stringtochar +import numpy.ma as ma +import sys +from netCDF4 import Dataset +import numpy as np +import argparse +import glob +import pytz +import os + +# how to specific rate of change? Will the function take rate of change as an +# argument? + +# Need a way of linking the rate of change to each different variable type + +# Could max rate of change be something we upload into the netcdf file atts? Or just in history? + +# Tell the function how much change you tolerate, and over what period of time - in sec? + +# Then convert to match the file timesteps + +# If files aren't specified, take all the IMOS*.nc files in the current folder +def roc_test_all_files(*args,target_vars_in=[]): + target_files = glob.glob('IMOS*.nc') + + roc_test_files(target_files, target_vars_in=target_vars_in,*args) + +def roc_test_files(target_files,*args,target_vars_in=[]): + + # Loop through each files in target_files + for current_file in target_files: + # Print each filename + print("input file %s" % current_file) + + print(args) + + # Extract netcdf data into nc + nc = Dataset(current_file, mode="a") + + # run the spike test - specifying *args here makes python unpack args to be passed again successfully as separate items + roc_test(nc,*args, target_vars_in=target_vars_in) + + +# Enter args as variable name and rate of change limit, ie. 'TEMP',4 +def roc_test(nc,*args,target_vars_in=[]): + + # Check the time format + if nc.variables['TIME'].getncattr('units') =='days since 1950-01-01 00:00:00 UTC': + + # Convert the args tuple to a list + args = list(args) + + # If a single rate of change limit is supplied + if len(args) == 1: + + change_per_hr = args[0] + + print('One rate of change limit will be applied to all variables') + + # If target_vars aren't user specified, set it to all the variables of + # the current_file, removing unwanted variables + if target_vars_in == []: + + all_vars = list(nc.variables.keys()) + + target_vars = [s for s in all_vars if 'TIME' not in s and 'quality_control' not in s and nc.variables[s].size!=1] + + print('target_vars are '+' '.join(target_vars)) + + else: + target_vars = target_vars_in + + # Extract the time data + nc_time = np.array(nc.variables['TIME'][:]) + + # Convert from days to hours + nc_time_hr = nc_time*24 + + # For each variable + for current_var in target_vars: + + # Extract the variable + nc_var = nc.variables[current_var] + + if nc_var.name + "_quality_control_roc" in nc.variables: + ncVarOut = nc.variables[nc_var.name + "_quality_control_roc"] + ncVarOut[:] = np.zeros(nc.variables[nc_var.name + "_quality_control_roc"].shape) + else: + ncVarOut = nc.createVariable(nc_var.name + "_quality_control_roc", "i1", nc_var.dimensions, fill_value=99, zlib=True) # fill_value=0 otherwise defaults to max + ncVarOut[:] = 0 + ncVarOut.long_name = "quality flag for " + nc_var.name + ncVarOut.flag_values = np.array([0, 1, 2, 3, 4, 6, 7, 9], dtype=np.int8) + ncVarOut.flag_meanings = 'unknown good_data probably_good_data probably_bad_data bad_data not_deployed interpolated missing_value' + + # add new variable to list of aux variables + nc_var.ancillary_variables = nc_var.ancillary_variables + " " + nc_var.name + "_quality_control_roc" + + # Extract the variable data + var_data = np.array(nc.variables[current_var][:]) + + print('Not equal to zero test type 1') + print(str(np.where(ncVarOut[:]!=0))) + + if (all(ncVarOut[:] == 0)): + print('All test specific qc values are zero before filling') + + # Calculate dvar/dtime + var_roc = np.divide(np.diff(var_data),np.diff(nc_time_hr)) + + # For any change greater than change_per_hr, assign a qc value of 4 + ncVarOut[[x for x in abs(np.insert(var_roc,0,0)) > change_per_hr]] = 4 + + # Extract global qc + qc_var = nc.variables[current_var + "_quality_control"] + + # Overwrite global qc with any higher values from test specific qc + qc_var[:] = np.maximum(ncVarOut[:], qc_var[:]) + + + print(current_var + ' tested: '+str(sum([x for x in abs(np.insert(var_roc,0,0)) > change_per_hr])) + ' changes found above '+str(change_per_hr)+' '+nc.variables[current_var].units+' per hour') + + # update the history attribute + try: + hist = nc.history + "\n" + except AttributeError: + hist = "" + + nc.setncattr('history', hist + datetime.utcnow().strftime("%Y-%m-%d") + 'rate of change test performed, with all changes above '+str(change_per_hr)+' flagged as 4') + + + # If multiple rate of change limits are supplied, with variable names + elif len(args) % 2 == 0 and all(isinstance(x,str) for x in args[0::2]) and all(isinstance(y,(float,int)) for y in args[1::2]): + + # Take target variables from args + target_vars = args[0::2] + + print('target_vars are '+' '.join(target_vars)) + + # Convert arguments to dict + rate_spec = dict(zip(args[0::2],args[1::2])) + + # Extract the time data + nc_time = np.array(nc.variables['TIME'][:]) + + # Convert from days to hours + nc_time_hr = nc_time*24 + + # For each variable + for current_var in target_vars: + + # Extract the variable + nc_var = nc.variables[current_var] + + if nc_var.name + "_quality_control_roc" in nc.variables: + ncVarOut = nc.variables[nc_var.name + "_quality_control_roc"] + ncVarOut[:] = np.zeros(nc.variables[nc_var.name + "_quality_control_roc"].shape) + else: + ncVarOut = nc.createVariable(nc_var.name + "_quality_control_roc", "i1", nc_var.dimensions, fill_value=99, zlib=True) # fill_value=0 otherwise defaults to max + ncVarOut[:] = 0 + ncVarOut.long_name = "quality flag for " + nc_var.name + ncVarOut.flag_values = np.array([0, 1, 2, 3, 4, 6, 7, 9], dtype=np.int8) + ncVarOut.flag_meanings = 'unknown good_data probably_good_data probably_bad_data bad_data not_deployed interpolated missing_value' + + # add new variable to list of aux variables + nc_var.ancillary_variables = nc_var.ancillary_variables + " " + nc_var.name + "_quality_control_roc" + + + # Extract the data + var_data = np.array(nc.variables[current_var]) + + print('Not equal to zero test type 2') + print(str(np.where(ncVarOut[:]!=0))) + + if (all(ncVarOut[:] == 0)): + print('All test specific qc values are zero before filling') + + # Calculate dvar/dtime + var_roc = np.divide(np.diff(var_data),np.diff(nc_time_hr)) + + # For any change greater than change_per_hr, assign a qc value of 4 + ncVarOut[[x for x in abs(np.insert(var_roc,0,0)) > rate_spec[current_var]]] = 4 + + print('ncVarOut 4s') + print(str(np.where(ncVarOut[:]==4))) + + print('Netcdf variable 4s') + print(str(np.where(nc.variables[nc_var.name + "_quality_control_roc"][:]==4))) + + nc.variables[nc_var.name + "_quality_control_roc"][:] = ncVarOut[:] + + print('Netcdf variable 4s after assignment') + print(str(np.where(nc.variables[nc_var.name + "_quality_control_roc"][:]==4))) + + + # Extract global qc + qc_var = nc.variables[current_var + "_quality_control"] + + # Overwrite global qc with any higher values from test specific qc + qc_var[:] = np.maximum(ncVarOut[:], qc_var[:]) + + print(current_var + ' tested: '+str(sum([x for x in abs(np.insert(var_roc,0,0)) > rate_spec[current_var]])) + ' changes found above '+str(rate_spec[current_var])+' '+nc.variables[current_var].units+' per hour') + + # update the history attribute + try: + hist = nc.history + "\n" + except AttributeError: + hist = "" + + nc.setncattr('history', hist + datetime.utcnow().strftime("%Y-%m-%d") + ': rate of change test performed, with all changes above those specified in the following list flagged as 4: '+str(args)) + + + else: + print('Arguments passed do not match the required format. No roc test performed.') + + + # If the time format doesn't match IMOS requirements + else: + print('Time format does not match the required IMOS form of: days since 1950-01-01 00:00:00 UTC') + + + nc.close() + + +# Not sure how to sys.argv[] with both *args and a keyword argument +if __name__ == "__main__": + # usage is <*args> + roc_test_files(target_files=[sys.argv[1]], target_vars_in=[sys.argv[2]], *sys.argv[3:]) + + + + diff --git a/ocean_dp/qc/select_in_water.py b/ocean_dp/qc/select_in_water.py new file mode 100755 index 0000000..8c296c1 --- /dev/null +++ b/ocean_dp/qc/select_in_water.py @@ -0,0 +1,120 @@ +#!/usr/bin/python3 + +# ocean_dp +# Copyright (C) 2020 Ben Weeding +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +from dateutil.parser import parse +from netCDF4 import Dataset, num2date, date2num +from datetime import datetime, timedelta +import sys +from datetime import datetime +import numpy as np +from dateutil import parser +import pytz +import os +import shutil + +# Submit argument as a list + + + +def select_in_water(netCDFfiles): + + new_name = [] # list of new file names + now = datetime.utcnow() + + # loop over all file names given + for fn in netCDFfiles: + + # Check the file is an IMOS formatted file + if fn.split('_')[0]=='IMOS': + fn_new = fn.replace("FV00", "FV01") + fn_new_split = fn_new.split('_') + # Change the creation date in the filename to today + fn_new_split[-1] = "C-" + now.strftime("%Y%m%d") + ".nc" + fn_new = '_'.join(fn_new_split) + else: + fn_new = fn.replace(".nc", "-trim.nc") + + # Add the new file name to the list of new file names + new_name.append(fn_new) + + # Load the original netcdf file + ods = Dataset(fn,'a') + + # Extract the time dimension, and the deployment start and end + # TODO: check this works + v = ods.get_variables_by_attributes(standard_name='time') + time = np.array(v[0][:]) + + inw = parse(ods.time_deployment_start) + outw = parse(ods.time_deployment_end) + + # Convert the start and end to the number format used in TIME + inw_num = date2num(inw.replace(tzinfo=None), units=ods.variables['TIME'].units) + outw_num = date2num(outw.replace(tzinfo=None), units=ods.variables['TIME'].units) + + # Create logical index of deployed times + deployed = np.logical_and(time>=inw_num, time<=outw_num) + + # Determine the length of the new time dimension + time_dim_len = len(time[deployed]) + + # Create the new netcdf file + ds = Dataset(fn_new, "w", format="NETCDF4") + + new_time_dim = ds.createDimension("TIME", time_dim_len) + + # Copy global attributes + for att in ods.ncattrs(): + ds.setncattr(att, ods.getncattr(att)) + + # Copy variables + for v_name, varin in ods.variables.items(): + + varout = ds.createVariable(v_name, varin.datatype, varin.dimensions) + + # Copy variable attributes + varout.setncatts({k: varin.getncattr(k) for k in varin.ncattrs()}) + + # Fill variables with deployed data + # TODO: should check if the dimensions for the variable include TIME, and truncate that dimension + if np.array(varin[:]).size == 1: + varout[:] = varin[:] + else: + varout[:] = np.array(varin[:])[deployed] + + ds.date_created = now.strftime("%Y-%m-%dT%H:%M:%SZ") + + # update the time coverage attributes + ds.time_coverage_start = ods.time_deployment_start + ds.time_coverage_end = ods.time_deployment_end + + # update the history attribute + try: + hist = ds.history + "\n" + except AttributeError: + hist = "" + ds.history += hist + now.strftime("%Y%m%d:") + 'Data subset to only contain deployed (in water) data - the full record can be found in the corresponding FV00 file.' + + ds.close() + ods.close() + + return new_name + + +if __name__ == "__main__": + select_in_water(sys.argv[1:]) diff --git a/ocean_dp/qc/spike_test.py b/ocean_dp/qc/spike_test.py new file mode 100755 index 0000000..089cf63 --- /dev/null +++ b/ocean_dp/qc/spike_test.py @@ -0,0 +1,176 @@ +# Copyright (C) 2020 Ben Weeding +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +import re +from datetime import datetime, timedelta +from netCDF4 import num2date, date2num +from netCDF4 import stringtochar +import numpy.ma as ma +import sys +from netCDF4 import Dataset +import numpy as np +import argparse +import glob +import pytz +import os + +default_high = 100 + +default_low = 50 + + +# If files aren't specified, take all the IMOS*.nc files in the current folder +def spike_test_all_files(target_vars_in=[], thresh_low=default_low, thresh_high=default_high, flag_low=3, flag_high=4): + target_files = glob.glob('IMOS*.nc') + + spike_test_files(target_files, target_vars_in=target_vars_in, thresh_low=thresh_low,thresh_high=thresh_high,flag_low=flag_low, flag_high=flag_high) + + +def spike_test_files(target_files, target_vars_in=[], thresh_low=default_low, thresh_high=default_high, flag_low=3, flag_high=4): + + # Loop through each files in target_files + for current_file in target_files: + # Print each filename + print("input file %s" % current_file) + + # Extract netcdf data into nc + nc = Dataset(current_file, mode="a") + + # run the spike test + spike_test(nc=nc, target_vars_in=target_vars_in, thresh_low=thresh_low,thresh_high=thresh_high,flag_low=flag_low, flag_high=flag_high) + + +def spike_test(nc, target_vars_in=[], thresh_low=default_low, thresh_high=default_high, flag_low=3, flag_high=4): + + # If target_vars aren't user specified, set it to all the variables of + # the current_file, removing unwanted variables + if target_vars_in == []: + + target_vars = list(nc.variables.keys()) + + # Remove TIME + target_vars.remove('TIME') + + # Remove any quality_control variables + qc_vars = [s for s in target_vars if 'quality_control' in s] + target_vars = [s for s in target_vars if s not in qc_vars] + + # Remove any variables of single length + single_vars = [s for s in target_vars if nc.variables[s].size==1] + target_vars = [s for s in target_vars if s not in single_vars] + + print('target_vars are '+' '.join(target_vars)) + + else: + target_vars = target_vars_in + + # For each variable + for current_var in target_vars: + + # Extract the variable + nc_var = nc.variables[current_var] + + # Create a test specific qc variable if it doesn't already exist + if nc_var.name + "_quality_control_spk" in nc.variables: + ncVarOut = nc.variables[nc_var.name + "_quality_control_spk"] + else: + ncVarOut = nc.createVariable(nc_var.name + "_quality_control_spk", "i1", nc_var.dimensions, fill_value=99, zlib=True) # fill_value=0 otherwise defaults to max + ncVarOut[:] = np.zeros(nc_var.shape) + ncVarOut.long_name = "quality flag for " + nc_var.name + ncVarOut.flag_values = np.array([0, 1, 2, 3, 4, 6, 7, 9], dtype=np.int8) + ncVarOut.flag_meanings = 'unknown good_data probably_good_data probably_bad_data bad_data not_deployed interpolated missing_value' + + # add new variable to list of aux variables + nc_var.ancillary_variables = nc_var.ancillary_variables + " " + nc_var.name + "_quality_control_spk" + + # Extract the variable data + var_data = np.array(nc.variables[current_var][:]) + + print('checking '+current_var+' for high spikes') + + # Step through the data, one element at a time, starting from the 2nd element + for i in range(1,(len(var_data)-1)): + + # Calculate the mean of the i-1 and i+1 elements + shoulder_mean = np.mean(np.take(var_data,[i-1,i+1])) + + # Calculate the step changes + shoulder_diff = np.diff(var_data[i-1:i+2]) + + # Check for spike exceeding high threshold + if (abs(var_data[i]-shoulder_mean) > thresh_high) & (True in (shoulder_diff>0)) & (True in (shoulder_diff<0)):# & (1.25*abs(shoulder_diff[0]) >= abs(x[1]) >= 0.75*abs(shoulder_diff[0])): + + print('High spike found') + + #set corresponding QC value to... + ncVarOut[i] = flag_high + + + # Find the indices where qc isn't set to 4 (high spike), removing the final element as it can't be check for a spike + low_spike_chk_idx = np.where(ncVarOut[:]!=4)[0][0:-1] + + + # Remove from the indices those that are either side of a high spike + for i in ncVarOut[:]==4: + + low_spike_chk_idx=low_spike_chk_idx[low_spike_chk_idx!=[i-1]] + + low_spike_chk_idx=low_spike_chk_idx[low_spike_chk_idx!=[i+1]] + + #print(low_spike_chk_idx) + + print('checking '+current_var+' for low spikes') + + # For each of the remaining indices + for i in low_spike_chk_idx: + + #print('i is '+str(i)) + + # Calculate the mean of the i-1 and i+1 elements + shoulder_mean = np.mean(np.take(var_data,[i-1,i+1])) + + # Calculate the step changes + shoulder_diff = np.diff(var_data[i-1:i+2]) + + # Check for spike exceeding low threshold + if (abs(var_data[i]-shoulder_mean) > thresh_low) & (True in (shoulder_diff>0)) & (True in (shoulder_diff<0)): #& (1.25*abs(shoulder_diff[0]) >= abs(x[1]) >= 0.75*abs(shoulder_diff[0])): + + print('Low spike found') + + #set corresponding QC value to... + ncVarOut[i] = flag_low + + nc.variables[current_var + "_quality_control"][:] = np.maximum(nc.variables[current_var + "_quality_control_spk"][:],nc.variables[current_var + "_quality_control"][:]) + + # update the history attribute + try: + hist = nc.history + "\n" + except AttributeError: + hist = "" + + nc.setncattr('history', hist + datetime.utcnow().strftime("%Y-%m-%d") + ' :spike_test performed on [' + str(target_vars) + '], with spikes greater than '+str(thresh_high)+' flagged as '+str(flag_high)+' and spikes greater than '+str(thresh_low)+' flagged as '+str(flag_low)) + + nc.close() + +if __name__ == "__main__": + # usage is + spike_test_files(target_files=[sys.argv[1]], target_vars_in=[sys.argv[2]], thresh_low=float(sys.argv[3]), thresh_high=float(sys.argv[4]), flag_low= float(sys.argv[5]), flag_high= float(sys.argv[6])) + + + + + + + diff --git a/ocean_dp/qc/temp_diff_hist_extra.py b/ocean_dp/qc/temp_diff_hist_extra.py new file mode 100755 index 0000000..14dc6cf --- /dev/null +++ b/ocean_dp/qc/temp_diff_hist_extra.py @@ -0,0 +1,83 @@ +# Copyright (C) 2020 Ben Weeding +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +import numpy.ma as ma +import sys +from netCDF4 import Dataset +import numpy as np +import argparse +import glob +import pytz +import os +import matplotlib.pyplot as plt +from matplotlib import colors +from matplotlib.ticker import PercentFormatter +import glob +from netCDF4 import num2date +from dateutil import parser +import datetime + +for root, dirs, files in os.walk("/Users/tru050/Desktop/cloudstor/Shared/SOTS-Temp-Raw-Data"): + + for fname in files: + + if fname.endswith('.nc') and 'FV01' in fname: + + print(fname) #Here, the wanted file name is printed + + ds = Dataset(os.path.join(root,fname), 'a') + + vars = ds.variables + + to_add = [] + for v in vars: + #print (vars[v].dimensions) + if v != 'TIME': + to_add.append(v) + + time_var = vars["TIME"] + time = num2date(time_var[:], units=time_var.units, calendar=time_var.calendar) + + time_deploy = parser.parse(ds.time_deployment_start, ignoretz=True) + time_recovery = parser.parse(ds.time_deployment_end, ignoretz=True) + + print(time_deploy) + + print(to_add) + for v in to_add: + if "TIME" in vars[v].dimensions: + + if v.endswith("_quality_control"): + + print("QC time dim ", v) + + ncVarOut = vars[v] + mask = (time <= time_deploy) | (time >= time_recovery) + ncVarOut[mask] = np.ones(vars[v].shape)[mask] * 7 + + + ds.file_version = "Level 1 - Quality Controlled Data" + + # update the history attribute + try: + hist = ds.history + "\n" + + except AttributeError: + hist = "" + + ds.setncattr('history', hist + datetime.utcnow().strftime("%Y-%m-%d") + ': in water test performed, with out of water data flagged at QC=7') + + + ds.close() \ No newline at end of file diff --git a/ocean_dp/qc/temp_diff_hist_glob b/ocean_dp/qc/temp_diff_hist_glob new file mode 100755 index 0000000..a87ca72 --- /dev/null +++ b/ocean_dp/qc/temp_diff_hist_glob @@ -0,0 +1,121 @@ +# Copyright (C) 2020 Ben Weeding +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +import numpy.ma as ma +import sys +from netCDF4 import Dataset +import numpy as np +import argparse +import glob +import pytz +import os +import matplotlib.pyplot as plt +from matplotlib import colors +from matplotlib.ticker import PercentFormatter +import glob + +deployments = [] + +for x in os.listdir("/Users/tru050/Desktop/cloudstor/Shared/SOTS-Temp-Raw-Data"): + + if ('Pulse' in x) or ('SOFS' in x): + + deployments.append(x) + + +fv01_files = glob.glob("/Users/tru050/Desktop/cloudstor/Shared/SOTS-Temp-Raw-Data/*/*/*FV01*.nc") + + +fig, ax = plt.subplots(4,4,sharex='all', sharey='all') + +ax=ax.flatten() + + +for current_deployment, plt_idx in zip(deployments, range(0,16)): + + + + for fname in files: + + if fname.find(current_deployment) and fname.endswith('.nc') and 'FV01' in fname: + + print(fname) #Here, the wanted file name is printed + + nc = Dataset(os.path.join(root,fname), mode = 'r') + + if 'TEMP_quality_control' in list(nc.variables) and np.array(nc.variables['TEMP'][:]).ndim == 1 and nc.variables['TIME'].getncattr('units') =='days since 1950-01-01 00:00:00 UTC': + + # Calculate temperature changes + nc_temp_diffs = np.diff(np.array(nc.variables['TEMP'][np.array(nc.variables['TEMP_quality_control'][:])!=7])) + + # Extract the time data + nc_time = np.array(nc.variables['TIME'][np.array(nc.variables['TEMP_quality_control'][:])!=7]) + + # Convert from days to hours + nc_time_hr = nc_time*24 + + # Calculate time changes + nc_time_hr_diffs = np.diff(nc_time_hr) + + # Calculate the rate of change of temperature wrt time + nc_dtemp_dtime = np.divide(nc_temp_diffs,nc_time_hr_diffs) + + # Add the results for this netcdf to the record for all files + #all_dtemp_dtime = np.concatenate((all_dtemp_dtime,nc_dtemp_dtime)) + + #all_dtemp_dtime_deps += ([nc.deployment_code] * len(nc_dtemp_dtime)) + + #netcdffiles.append(fname) + + #mins.append(np.amin(nc_dtemp_dtime)) + + #maxs.append(np.amax(nc_dtemp_dtime)) + + + + nc.close() + + ax[plt_idx].hist(nc_dtemp_dtime,100,log=True) + + ax[plt_idx].set_ylim(bottom=0.1,top=10E5) + + ax[plt_idx].set_xlim(left=-500, right=500) + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/ocean_dp/qc/temp_diff_hist_glob.py b/ocean_dp/qc/temp_diff_hist_glob.py new file mode 100755 index 0000000..1e63d14 --- /dev/null +++ b/ocean_dp/qc/temp_diff_hist_glob.py @@ -0,0 +1,119 @@ +# Copyright (C) 2020 Ben Weeding +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +import numpy.ma as ma +import sys +from netCDF4 import Dataset +import numpy as np +import argparse +import glob +import pytz +import os +import matplotlib.pyplot as plt +from matplotlib import colors +from matplotlib.ticker import PercentFormatter +import glob + +deployments = [] + +for x in os.listdir("/Users/tru050/Desktop/cloudstor/Shared/SOTS-Temp-Raw-Data"): + + if ('Pulse' in x) or ('SOFS' in x): + + deployments.append(x) + + +deployment_dtemp_dtime = np.array([]) + +fv01_files = glob.glob("/Users/tru050/Desktop/cloudstor/Shared/SOTS-Temp-Raw-Data/*/*/*FV01*.nc") + + +fig, ax = plt.subplots(4,4,sharex='all', sharey='all') + +ax=ax.flatten() + + +for current_deployment, plt_idx in zip(deployments, range(0,len(deployments))): + + print(current_deployment + 'files') + + for fname in fv01_files: + + if current_deployment in fname: + + #print(fname + ' contains ' + current_deployment) #Here, the wanted file name is printed + + nc = Dataset(fname, mode = 'r') + + if 'TEMP_quality_control' in list(nc.variables) and np.array(nc.variables['TEMP'][:]).ndim == 1: + + print(fname) + + # Calculate temperature changes + nc_temp_diffs = np.diff(np.array(nc.variables['TEMP'][np.array(nc.variables['TEMP_quality_control'][:])!=7])) + + # Extract the time data + nc_time = np.array(nc.variables['TIME'][np.array(nc.variables['TEMP_quality_control'][:])!=7]) + + # Convert from days to hours + nc_time_hr = nc_time*24 + + # Calculate time changes + nc_time_hr_diffs = np.diff(nc_time_hr) + + # Calculate the rate of change of temperature wrt time + nc_dtemp_dtime = np.divide(nc_temp_diffs,nc_time_hr_diffs) + + # Add the results for this netcdf to the record for the deployment + deployment_dtemp_dtime = np.concatenate((deployment_dtemp_dtime,nc_dtemp_dtime)) + + nc.close() + + print('plotting '+ str(len(deployment_dtemp_dtime)) + ' values') + + ax[plt_idx].hist(deployment_dtemp_dtime,100) + + #ax[plt_idx].set_ylim(bottom=0.1,top=10E5) + + #ax[plt_idx].set_xlim(left=-10, right=10) + + deployment_dtemp_dtime = np.array([]) + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/ocean_dp/qc/temp_diff_histograms.py b/ocean_dp/qc/temp_diff_histograms.py new file mode 100755 index 0000000..a3f8420 --- /dev/null +++ b/ocean_dp/qc/temp_diff_histograms.py @@ -0,0 +1,335 @@ +# Copyright (C) 2020 Ben Weeding +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +import numpy.ma as ma +import sys +from netCDF4 import Dataset, num2date +from dateutil import parser +import numpy as np +import argparse +import glob +import pytz +import os +import matplotlib.pyplot as plt +from matplotlib import colors +from matplotlib.ticker import PercentFormatter +from sigfig import round + +deployments = [] + +for x in os.listdir("/Users/tru050/Desktop/cloudstor/Shared/SOTS-Temp-Raw-Data"): + + if ('Pulse' in x) or ('SOFS' in x): + + deployments.append(x) + + + +# check for in water test in history of netcdf file, if not perform the test + +netcdffiles = [] + +mins=[] + +maxs=[] + +all_dtemp_dtime = np.array([]) + +all_dtemp_dtime_deps = [] + +for root, dirs, files in os.walk("/Users/tru050/Desktop/cloudstor/Shared/SOTS-Temp-Raw-Data"): + + for fname in files: + + if fname.endswith('.nc') and 'FV01' in fname: + + print(fname) #Here, the wanted file name is printed + + + nc = Dataset(os.path.join(root,fname), mode = 'r') + + if 'TEMP_quality_control' in list(nc.variables) and np.array(nc.variables['TEMP'][:]).ndim == 1 and nc.variables['TIME'].getncattr('units') =='days since 1950-01-01 00:00:00 UTC': + + # Calculate temperature changes + nc_temp_diffs = np.diff(np.array(nc.variables['TEMP'][np.array(nc.variables['TEMP_quality_control'][:])!=7])) + + # Extract the time data + nc_time = np.array(nc.variables['TIME'][np.array(nc.variables['TEMP_quality_control'][:])!=7]) + + # Convert from days to hours + nc_time_hr = nc_time*24 + + # Calculate time changes + nc_time_hr_diffs = np.diff(nc_time_hr) + + # Calculate the rate of change of temperature wrt time + nc_dtemp_dtime = np.divide(nc_temp_diffs,nc_time_hr_diffs) + + # Add the results for this netcdf to the record for all files + all_dtemp_dtime = np.concatenate((all_dtemp_dtime,nc_dtemp_dtime)) + + all_dtemp_dtime_deps += ([nc.deployment_code] * len(nc_dtemp_dtime)) + + netcdffiles.append(fname) + + mins.append(np.amin(nc_dtemp_dtime)) + + maxs.append(np.amax(nc_dtemp_dtime)) + + nc.close() + + +fig, ax = plt.subplots() + +bins = np.linspace(-450,450,901) + +line_thick = 0.5 + +counts,bins,bars = ax.hist(all_dtemp_dtime,bins,log=True) + +ax.axvline(x=3*np.std(all_dtemp_dtime),color='r',linewidth=line_thick) + +ax.axvline(x=-3*np.std(all_dtemp_dtime),color='r',linewidth=line_thick) + +ax.set_title('Hourly temp changes from all FV01 files in SOTS-TEMP-Raw_Data') + +label_coords = (0.01, 0.9) + +label_method = 'axes fraction' + +ax.annotate('~1.84E7 measurements',xy=label_coords, xycoords=label_method) + + + +def last_four(entry): + + output = entry[-4::] + + return output + + +deployments = [] + +for x in os.listdir("/Users/tru050/Desktop/cloudstor/Shared/SOTS-Temp-Raw-Data"): + + if ('Pulse' in x) or ('SOFS' in x): + + deployments.append(x) + +deployments.sort(key=last_four) + + +####### + + + + +all_deployment_dtemp_dtime = [None] * len(deployments) + +for current_deployment, plt_idx in zip(deployments, range(0,len(deployments))): + + print('current deployment is '+current_deployment) + + deployment_dtemp_dtime = np.array([]) + + for root, dirs, files in os.walk("/Users/tru050/Desktop/cloudstor/Shared/SOTS-Temp-Raw-Data"): + + for fname in files: + + if current_deployment in fname and fname.endswith('.nc') and 'FV00' in fname: + + #print(fname) #Here, the wanted file name is printed + + nc = Dataset(os.path.join(root,fname), mode = 'r') + + if 'TEMP' in nc.variables and np.array(nc.variables['TEMP'][:]).ndim == 1 and nc.variables['TIME'].getncattr('units') =='days since 1950-01-01 00:00:00 UTC': + + time_var = nc.variables["TIME"] + + time = num2date(time_var[:], units=time_var.units, calendar=time_var.calendar) + + time_deploy = parser.parse(nc.time_deployment_start, ignoretz=True) + + time_recovery = parser.parse(nc.time_deployment_end, ignoretz=True) + + #print('using '+fname) + + temp_extract = np.array(nc.variables['TEMP'][:][(time >= time_deploy) | (time <= time_recovery)]) + + # Calculate temperature changes + nc_temp_diffs = np.diff(temp_extract) + + # Extract the time data + nc_time = np.array(nc.variables['TIME'][:][(time >= time_deploy) | (time <= time_recovery)]) + + # Convert from days to hours + nc_time_hr = nc_time*24 + + # Calculate time changes + nc_time_hr_diffs = np.diff(nc_time_hr) + + # Calculate the rate of change of temperature wrt time + nc_dtemp_dtime = np.divide(nc_temp_diffs,nc_time_hr_diffs) + + # Add the results for this netcdf to the record for the deployment + deployment_dtemp_dtime = np.concatenate((deployment_dtemp_dtime,nc_dtemp_dtime)) + + all_deployment_dtemp_dtime[plt_idx] = deployment_dtemp_dtime + + nc.close() + + + + + + + + +fig, ax = plt.subplots(4,4) + +ax=ax.flatten() + +line_thick = 1 + +label_coords = (0.6, 0.6) +label_method = 'axes fraction' + +for plt_idx,dep_name in zip(range(0,len(deployments)),deployments): + + print('plotting '+ str((all_deployment_dtemp_dtime[plt_idx])) + ' values') + + hist_data = ax[plt_idx].hist(all_deployment_dtemp_dtime[plt_idx],21,log=True) + + ax[plt_idx].set_title(dep_name,fontsize=10) + + #ax[plt_idx].axvline(x=3*np.mean(all_deployment_dtemp_dtime[plt_idx]),color='g',linewidth=line_thick) + + ax[plt_idx].axvline(x=np.mean(all_deployment_dtemp_dtime[plt_idx])+3*np.std(all_deployment_dtemp_dtime[plt_idx]),color='r',linewidth=line_thick) + + ax[plt_idx].axvline(x=np.mean(all_deployment_dtemp_dtime[plt_idx])-3*np.std(all_deployment_dtemp_dtime[plt_idx]),color='r',linewidth=line_thick) + + anno = 'mean = '+str(round(float(np.mean(all_deployment_dtemp_dtime[plt_idx])),sigfigs=3)) + + anno += '\n3SD = ' + str(round(float(3*np.std(all_deployment_dtemp_dtime[plt_idx])),sigfigs=3)) + + anno += '\nsamples = ' + str(len(all_deployment_dtemp_dtime[plt_idx])) + + ax[plt_idx].annotate(anno,xy=label_coords, xycoords=label_method,fontsize=8) + + #ax[plt_idx].set_ylim(bottom=0,top=np.max(hist_data[0])) + + #ax[plt_idx].set_xlim(left=-450, right=450) np.linspace(-450,450,901) + +#ax[-1].axis('off') + +all_data = np.concatenate(all_deployment_dtemp_dtime) + +hist_data = ax[15].hist(all_data,21,log=True) + +ax[15].set_title('All data',fontsize=10) + +#ax[plt_idx].axvline(x=3*np.mean(all_deployment_dtemp_dtime[plt_idx]),color='g',linewidth=line_thick) + +ax[15].axvline(x=np.mean(all_data)+3*np.std(all_data),color='r',linewidth=line_thick) + +ax[15].axvline(x=np.mean(all_data)-3*np.std(all_data),color='r',linewidth=line_thick) + +anno = 'mean = '+str(round(float(np.mean(all_data)),sigfigs=3)) + +anno += '\n3SD = ' + str(round(float(3*np.std(all_data)),sigfigs=3)) + +anno += '\nsamples = ' + str(len(all_data)) + +ax[15].annotate(anno,xy=label_coords, xycoords=label_method,fontsize=8) + + +fig.subplots_adjust(left=0.05,right=0.99,bottom=0.1,top=0.9,wspace=0.15,hspace=0.4) + + + + + + + + + + + + + + +files = glob.glob('*.nc') + +temp_diffs = np.array([]) + +for current_file in files: + + nc = Dataset(current_file,mode='r') + + temp_diffs = np.concatenate((temp_diffs,np.diff(np.array(nc.variables['TEMP'][:])))) + +fig, ax = plt.subplots() + +ax.hist(temp_diffs,100,log=True) + + + + +# use os.walk??? to run in each netcdf folder?? os.scandir()? + + + + + +# sofs75_60m = Dataset('IMOS_ABOS-SOTS_T_20180801_SOFS_FV00_SOFS-7.5-2018-Starmon-mini-4051-60m_END-20190331_C-20200204.nc',mode='r') +# sofs75_70m = Dataset('IMOS_ABOS-SOTS_T_20180801_SOFS_FV00_SOFS-7.5-2018-Starmon-mini-4052-70m_END-20190331_C-20200204.nc',mode='r') +# sofs75_75m = Dataset('IMOS_ABOS-SOTS_T_20180801_SOFS_FV00_SOFS-7.5-2018-Starmon-mini-4053-75m_END-20190331_C-20200204.nc',mode='r') + + +# temp_60 = np.array(sofs75_60m.variables['TEMP'][:]) +# temp_70 = np.array(sofs75_70m.variables['TEMP'][:]) +# temp_75 = np.array(sofs75_75m.variables['TEMP'][:]) + +# label_coords = (0.01, 0.85) +# label_method = 'axes fraction' + +# fig, axs = plt.subplots(3, 1, sharey=True) +# axs[0].set_title('Temp sensor comparison SOFS7.5') + +# axs[0].hist(np.diff(temp_60),bins=100,log=True, histtype='bar', stacked=True) +# axs[0].set_ylim(bottom=0.1,top=10E5) +# axs[0].set_xlim(left=-40, right=40) +# axs[0].annotate('60m',xy=label_coords, xycoords=label_method) +# axs[0].tick_params(labelbottom=False) + +# axs[1].hist(np.diff(temp_70),bins=100,log=True) +# axs[1].set_ylim(bottom=0.1,top=10E5) +# axs[1].set_xlim(left=-40, right=40) +# axs[1].annotate('70m',xy=label_coords, xycoords=label_method) +# axs[1].tick_params(labelbottom=False) + +# axs[2].hist(np.diff(temp_75),bins=100,log=True) +# axs[2].set_ylim(bottom=0.1,top=10E5) +# axs[2].set_xlim(left=-40, right=40) +# axs[2].annotate('75m',xy=label_coords, xycoords=label_method) + + + +# fig.savefig('test.pdf') + + + + + + diff --git a/ocean_dp/qc/temp_diff_timeseries_from_fv00.py b/ocean_dp/qc/temp_diff_timeseries_from_fv00.py new file mode 100755 index 0000000..9a38ccf --- /dev/null +++ b/ocean_dp/qc/temp_diff_timeseries_from_fv00.py @@ -0,0 +1,267 @@ +# Copyright (C) 2020 Ben Weeding +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +import numpy.ma as ma +import sys +from netCDF4 import Dataset, num2date +from dateutil import parser +import numpy as np +import argparse +import glob +import pytz +import os +import matplotlib.pyplot as plt +from matplotlib import colors +from matplotlib.ticker import PercentFormatter +from sigfig import round + +def last_four(entry): + + output = entry[-4::] + + return output + + +def sp_layout(num_in): + + sp_nums = np.array([1,2,4,6,9,12,16,20,25,30]) + + sp_dict={1:[1,1],2:[2,1],4:[2,2],6:[3,2],9:[3,3],12:[4,3],16:[4,4],20:[5,4],25:[5,5],30:[6,5]} + + return sp_dict[sp_nums[np.where(num_in<=sp_nums)[0][0]]] + + + +cmap = colors.ListedColormap(['blue','orange','red']) + +boundaries = [0,20,30,2000] + +norm = colors.BoundaryNorm(boundaries, cmap.N, clip=True) + + + +deployments = [] + +for x in next(os.walk("/Users/tru050/Desktop/cloudstor/Shared/SOTS-Temp-Raw-Data"))[1]: + + if (('Pulse' in x) or ('SOFS' in x)): + + + deployments.append(x) + +deployments.sort(key=last_four) + + + + + + +for current_deployment in deployments: + + acceptable_files = [] + + acceptable_depths = [] + + print('current deployment is '+current_deployment) + + deployment_dtemp_dtime = np.array([]) + + for root, dirs, files in os.walk("/Users/tru050/Desktop/cloudstor/Shared/SOTS-Temp-Raw-Data"): + + for fname in files: + + print('checking '+fname) + + if current_deployment in fname and fname.endswith('.nc') and 'FV00' in fname: + + print('opening '+fname) + + nc = Dataset(os.path.join(root,fname), mode = 'r') + + if 'TEMP' in nc.variables and np.array(nc.variables['TEMP'][:]).ndim == 1 and nc.variables['TIME'].getncattr('units') =='days since 1950-01-01 00:00:00 UTC': + + acceptable_files.append(os.path.join(root,fname)) + + acceptable_depths.append(nc.instrument_nominal_depth) + + print(fname+' accepted') + + nc.close() + + + acceptable_files = [x for _,x in sorted(zip(acceptable_depths,acceptable_files))] + + + fig, ax = plt.subplots(sp_layout(len(acceptable_files))[0],sp_layout(len(acceptable_files))[1],figsize=((12,8)),constrained_layout=True,sharex=True) + + ax=ax.flatten() + + fig.suptitle(current_deployment, fontsize=14) + + + for fpath,f_idx in zip(acceptable_files, range(0,len(acceptable_files))): + + nc = Dataset(fpath, mode = 'r') #mismatching files and folders?? looking for sofs7.5 file in pulse 7 folder!?!? + + time_var = nc.variables["TIME"] + + time = num2date(time_var[:], units=time_var.units, calendar=time_var.calendar) + + time_deploy = parser.parse(nc.time_deployment_start, ignoretz=True) + + time_recovery = parser.parse(nc.time_deployment_end, ignoretz=True) + + #print('using '+fname) + + temp_extract = np.array(nc.variables['TEMP'][:][(time > time_deploy) & (time < time_recovery)]) + + # Calculate temperature changes + nc_temp_diffs = np.diff(temp_extract) + + # Extract the time data + nc_time = np.array(nc.variables['TIME'][:][(time > time_deploy) & (time < time_recovery)]) + + # Convert from days to hours + nc_time_hr = nc_time*24 + + # Calculate time changes + nc_time_hr_diffs = np.diff(nc_time_hr) + + # Calculate the rate of change of temperature wrt time + nc_dtemp_dtime = np.divide(nc_temp_diffs,nc_time_hr_diffs) + + im=ax[f_idx].scatter(nc_time,temp_extract,s=0.2,c=np.concatenate((np.array([0]),np.abs(nc_dtemp_dtime))),cmap=cmap,norm=norm) + + im=ax[f_idx].scatter(nc_time[np.concatenate((np.array([0]),np.abs(nc_dtemp_dtime)))>20],temp_extract[np.concatenate((np.array([0]),np.abs(nc_dtemp_dtime)))>20],s=0.5,c=np.concatenate((np.array([0]),np.abs(nc_dtemp_dtime)))[np.concatenate((np.array([0]),np.abs(nc_dtemp_dtime)))>20],cmap=cmap,norm=norm) + + ax[f_idx].set_title(str(nc.instrument_nominal_depth)+'m',fontsize=10) + + # Add the results for this netcdf to the record for the deployment + deployment_dtemp_dtime = np.concatenate((deployment_dtemp_dtime,nc_dtemp_dtime)) + + nc.close() + + if f_idx==0: + + fig.colorbar(im) + + for f_idx in range(len(acceptable_files),len(ax)): + + ax[f_idx].set_axis_off() + + +############################################################################## + +fig, ax = plt.subplots(sp_layout(len(deployments))[0],sp_layout(len(deployments))[1],figsize=((12,8)),constrained_layout=True,sharex=False) + +ax=ax.flatten() + +for current_deployment,d_idx in zip(deployments,range(0,len(deployments))): + + acceptable_files = [] + + acceptable_depths = [] + + print('current deployment is '+current_deployment) + + deployment_dtemp_dtime = np.array([]) + + for root, dirs, files in os.walk("/Users/tru050/Desktop/cloudstor/Shared/SOTS-Temp-Raw-Data"): + + for fname in files: + + print('checking '+fname) + + if current_deployment in fname and fname.endswith('.nc') and 'FV00' in fname: + + print('opening '+fname) + + nc = Dataset(os.path.join(root,fname), mode = 'r') + + if 'TEMP' in nc.variables and np.array(nc.variables['TEMP'][:]).ndim == 1 and nc.variables['TIME'].getncattr('units') =='days since 1950-01-01 00:00:00 UTC': + + acceptable_files.append(os.path.join(root,fname)) + + acceptable_depths.append(nc.instrument_nominal_depth) + + print(fname+' accepted') + + nc.close() + + + acceptable_files = [x for _,x in sorted(zip(acceptable_depths,acceptable_files))] + + + for fpath in acceptable_files: + + nc = Dataset(fpath, mode = 'r') #mismatching files and folders?? looking for sofs7.5 file in pulse 7 folder!?!? + + time_var = nc.variables["TIME"] + + time = num2date(time_var[:], units=time_var.units, calendar=time_var.calendar) + + time_deploy = parser.parse(nc.time_deployment_start, ignoretz=True) + + time_recovery = parser.parse(nc.time_deployment_end, ignoretz=True) + + #print('using '+fname) + + temp_extract = np.array(nc.variables['TEMP'][:][(time > time_deploy) & (time < time_recovery)]) + + # Calculate temperature changes + nc_temp_diffs = np.diff(temp_extract) + + # Extract the time data + nc_time = np.array(nc.variables['TIME'][:][(time > time_deploy) & (time < time_recovery)]) + + # Convert from days to hours + nc_time_hr = nc_time*24 + + # Calculate time changes + nc_time_hr_diffs = np.diff(nc_time_hr) + + # Calculate the rate of change of temperature wrt time + nc_dtemp_dtime = np.divide(nc_temp_diffs,nc_time_hr_diffs) + + # Add the results for this netcdf to the record for the deployment + deployment_dtemp_dtime = np.concatenate((deployment_dtemp_dtime,nc_dtemp_dtime)) + + nc.close() + + ax[d_idx].hist(deployment_dtemp_dtime,21,log=True) + + ax[d_idx].set_title(current_deployment,fontsize=10) + + + + + + + + + + + + + + + + + + + + + + diff --git a/ocean_dp/qc/temp_stat_plot.py b/ocean_dp/qc/temp_stat_plot.py new file mode 100755 index 0000000..5b31a82 --- /dev/null +++ b/ocean_dp/qc/temp_stat_plot.py @@ -0,0 +1,365 @@ +# Copyright (C) 2020 Ben Weeding +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +import numpy.ma as ma +import sys +from netCDF4 import Dataset, num2date +from dateutil import parser +import numpy as np +import argparse +import glob +import pytz +import os +import matplotlib.pyplot as plt +from matplotlib import colors +from matplotlib.ticker import PercentFormatter +from sigfig import round +import pandas as pd + +import warnings +import scipy.stats as st +import statsmodels as sm +import matplotlib + + +############################# Data extraction ################################ + +# creates an empty array to store the names of the SOTS deployments +deployments = [] + +checked_files = [] + +processed_files = [] + +# loops through all the folders and files contained in the folder +for x in os.listdir("/Users/tru050/Desktop/cloudstor/Shared/SOTS-Temp-Raw-Data"): + + # if the folder/file name contains 'Pulse' or 'SOFS' and doesn't contain '.', append it to deployments + if (('Pulse' in x) or ('SOFS' in x)) and ('.p' not in x): + + deployments.append(x) + +# create a dataframe to store extract information +sots_temp_ensemble = pd.DataFrame(columns = ["dTemp/dtime","dTemp/dSample","QC","Nominal depth","Deployment"]) + +# loops through all files in the directory +for root, dirs, files in os.walk("/Users/tru050/Desktop/cloudstor/Shared/SOTS-Temp-Raw-Data"): + + for fname in files: + + # append the filename to the list of checked files + checked_files.append(fname) + + # for each netcdf file labelled as FV01 and containing a deployment in its name + if fname.endswith('.nc') and 'FV01' in fname and any(ele in fname for ele in deployments): + + # print the filename + print(fname) + + # open the file + nc = Dataset(os.path.join(root,fname), mode = 'a') + + # check file contains temperature data + if 'TEMP' in list(nc.variables): + + # check that the in_out_water test has been run on the file, if not run in_out_water code + if not 'TEMP_quality_control_io' in list(nc.variables): + + # run in_out_water script - uncommented at this point as just copied and pasted + var_name = 'TEMP' + nc_vars = nc.variables + to_add = [] + if var_name: + to_add.append(var_name) + else: + for v in nc_vars: + #print (vars[v].dimensions) + if v != 'TIME': + to_add.append(v) + + time_var = nc_vars["TIME"] + time = num2date(time_var[:], units=time_var.units, calendar=time_var.calendar) + + time_deploy = parser.parse(nc.time_deployment_start, ignoretz=True) + time_recovery = parser.parse(nc.time_deployment_end, ignoretz=True) + + print('deployment time', time_deploy) + + print(to_add) + + # create a mask for the time range + mask = (time <= time_deploy) | (time >= time_recovery) + + for v in to_add: + if "TIME" in nc_vars[v].dimensions: + if v.endswith("_quality_control"): + print("QC time dim ", v) + + ncVarOut = nc_vars[v] + ncVarOut[mask] = 7 + else: + # create a qc variable just for this test flags + if v + "_quality_control_io" in nc.variables: + ncVarOut = nc.variables[v + "_quality_control_io"] + else: + ncVarOut = nc.createVariable(v + "_quality_control_io", "i1", nc_vars[v].dimensions, fill_value=99, zlib=True) # fill_value=0 otherwise defaults to max + ncVarOut[:] = np.zeros(nc_vars[v].shape) + ncVarOut.long_name = "quality flag for " + v + ncVarOut.flag_values = np.array([0, 1, 2, 3, 4, 6, 7, 9], dtype=np.int8) + ncVarOut.flag_meanings = 'unknown good_data probably_good_data probably_bad_data bad_data not_deployed interpolated missing_value' + + nc_vars[v].ancillary_variables = nc_vars[v].ancillary_variables + " " + v + "_quality_control_io" + ncVarOut[mask] = 7 + + nc.variables[v + "_quality_control"][:] = np.maximum(nc.variables[v + "_quality_control_io"][:],nc.variables[v + "_quality_control"][:]) + + nc.file_version = "Level 1 - Quality Controlled Data" + + + + # check that the file has a single dimension temperature vector, and that the time format is correct + if np.array(nc.variables['TEMP'][:]).ndim == 1 and nc.variables['TIME'].getncattr('units') =='days since 1950-01-01 00:00:00 UTC': + + # calculate temperature changes for in water data + nc_temp_diffs = np.diff(np.array(nc.variables['TEMP'][np.array(nc.variables['TEMP_quality_control'][:])!=7])) + + # extract the time data + nc_time = np.array(nc.variables['TIME'][np.array(nc.variables['TEMP_quality_control'][:])!=7]) + + # Convert from days to hours + nc_time_hr = nc_time*24 + + # Calculate time changes in hours + nc_time_hr_diffs = np.diff(nc_time_hr) + + # calculate the rate of change of temperature wrt time (degrees °C per hour) + nc_dtemp_dtime = np.divide(nc_temp_diffs,nc_time_hr_diffs) + + + + # extract temp_qc data + nc_temp_qc = np.array(nc.variables['TEMP_quality_control'][np.array(nc.variables['TEMP_quality_control'][:])!=7]) + + # calculate qc values for each nc_dtemp_dtime by taking the maximum of the qc values of the two contributing temps + nc_dtemp_dtime_qc = pd.Series(nc_temp_qc).rolling(2).max().dropna().to_numpy() + + + + # extract sensor nominal depth + nc_nom_depth = np.array(nc.variables['NOMINAL_DEPTH']) + + # create a vector the same size as nc_dtemp_dtime with the nominal depth + nc_nom_depth_vector = np.repeat(nc_nom_depth,len(nc_dtemp_dtime)) + + + + # extract deployment name + nc_deployment = nc.deployment_code + + # create a list the same size as nc_dtemp_dtime with the deployment name + nc_deployment_list = [nc_deployment] * len(nc_dtemp_dtime) + + + + # combine information into an length x 4 dataframe + nc_temp_ensemble = pd.DataFrame({"dTemp/dtime":nc_dtemp_dtime,"dTemp/dSample":nc_temp_diffs,"QC":nc_dtemp_dtime_qc,"Nominal depth":nc_nom_depth_vector,"Deployment":nc_deployment_list}) + + # append the current netcdf's dataframe to the sots_temp_ensemble + sots_temp_ensemble = sots_temp_ensemble.append(nc_temp_ensemble) + + # append the filename to the list of processed files + processed_files.append(fname) + + + nc.close() + + +############################# Data processing ################################ + +# creates a new dataframe containing only data with QC < 3 +sots_temp_ensemble_qc210 = sots_temp_ensemble[sots_temp_ensemble["QC"]<3] + +# calculates overall standard deviation +std_time_total = np.std(sots_temp_ensemble_qc210["dTemp/dtime"]) + + + + +# creates an emply list to store data deployment by deployment +std_by_deployment_data = [] + +# creates a dict of deployment names and standard deviations +for i in sots_temp_ensemble_qc210.Deployment.unique(): + std_by_deployment_data.append( + { + 'Deployment': i, + 'STD time': np.std(sots_temp_ensemble_qc210["dTemp/dtime"][sots_temp_ensemble_qc210["Deployment"]==i]), + 'STD sample': np.std(sots_temp_ensemble_qc210["dTemp/dSample"][sots_temp_ensemble_qc210["Deployment"]==i]) + } + ) + +# creates a Dataframe from the dict +std_by_deployment = pd.DataFrame(std_by_deployment_data) + + + + + +# ============================================================================= +# std_by_depth: this function takes two compulsary arguments (top: the shallowest +# depth(m)), bottom: the deepest depth(m)) and one option argument (deployment_in: +# the deployment from which data will be taken). The function will return the standard +# deviation of the d(Temp)/d(Time) data from sensors with nominal depths at and +# between the two depths, and from only the deployment_in if specified. +# +# sample call: std_by_depth(500,10000,'SOFS-7.5-2018') +# +# this will give the std of all d(Temp)/d(Time) data from SOFS-7.5-2018 from +# sensors with 500m <= nominal depth <= 10000m +# ============================================================================= + +def std_by_depth_temp(top,bottom,deployment_in=None,rate='time'): + + selection = '' + + if rate == 'time': + + selection = "dTemp/dtime" + + elif rate == 'sample': + + selection = "dTemp/dSample" + + else: + + return "incorrect rate specification" + + + + # if user incorrectly inputs depths, swap them and run the code + if top > bottom: + + top, bottom = bottom, top + + if deployment_in == None: + + # subsamples sots_temp_ensemble_qc210 based on depth + target_ensemble = sots_temp_ensemble_qc210[(sots_temp_ensemble_qc210["Nominal depth"]>=top) & (sots_temp_ensemble_qc210["Nominal depth"]<=bottom)] + + elif isinstance(deployment_in, list): + + # subsamples sots_temp_ensemble_qc210 based on depth + target_ensemble = sots_temp_ensemble_qc210[(sots_temp_ensemble_qc210["Nominal depth"]>=top) & (sots_temp_ensemble_qc210["Nominal depth"]<=bottom) & (sots_temp_ensemble_qc210.Deployment.isin(deployment_in))] + + + else: + + # subsamples sots_temp_ensemble_qc210 based on depth + target_ensemble = sots_temp_ensemble_qc210[(sots_temp_ensemble_qc210["Nominal depth"]>=top) & (sots_temp_ensemble_qc210["Nominal depth"]<=bottom) & (sots_temp_ensemble_qc210["Deployment"]==deployment_in)] + + # if not data is available with the given choices, end the function + if len(target_ensemble)==0: + + return 'No data available for those choices' + + + # calculates the mean of the subsample + target_mean = np.mean(target_ensemble[selection]) + + # calculates the standard deviation of the subsample + target_std = np.std(target_ensemble[selection]) + + # sets line thickness for plot + line_thick = 1 + + # creates axes for histogram + ax_hist=plt.axes() + + # plots a histogram of the data selected + target_ensemble.hist(column=selection,bins=100,log=True,ax=ax_hist) + + # draws lines at the mean +- 3 STD on the histogram + ax_hist.axvline(x=target_mean+3*target_std,color='r',linewidth=line_thick) + + ax_hist.axvline(x=target_mean-3*target_std,color='r',linewidth=line_thick) + + # sets the x label + if rate == 'time': + + ax_hist.set_xlabel('°C/hr') + + elif rate == 'sample': + + ax_hist.set_xlabel('°C/sample') + + + label_coords = (0.70, 0.99) + label_method = 'axes fraction' + + anno = 'mean = '+str(round(float(target_mean),sigfigs=3)) + + anno += '\n3 STD = ' + str(round(float(3*target_std),sigfigs=3)) + + anno += '\nno. samples = ' + str(len(target_ensemble)) + + anno += '\n'+str(top)+'m <= depth <= '+str(bottom)+'m' + + if deployment_in == None: + + anno += '\nall available data' + + elif isinstance(deployment_in, list): + + anno += '\n' + + anno += '\n'.join(deployment_in) + + else: + + anno += '\n'+deployment_in + + ax_hist.annotate(anno,xy=label_coords, xycoords=label_method,fontsize=8,va = "top", ha="left") + + # returns the standard deviation of the subsample + return target_std + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/ocean_dp/qc/temp_time_diff_plots.py b/ocean_dp/qc/temp_time_diff_plots.py new file mode 100755 index 0000000..cf42a13 --- /dev/null +++ b/ocean_dp/qc/temp_time_diff_plots.py @@ -0,0 +1,117 @@ +# Copyright (C) 2020 Ben Weeding +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +import numpy.ma as ma +import sys +from netCDF4 import Dataset, num2date +from dateutil import parser +import numpy as np +import argparse +import glob +import pytz +import os +import matplotlib.pyplot as plt +from matplotlib import colors +from matplotlib.ticker import PercentFormatter +from sigfig import round + + + +def last_four(entry): + + output = entry[-4::] + + return output + + +deployments = [] + +for x in os.listdir("/Users/tru050/Desktop/cloudstor/Shared/SOTS-Temp-Raw-Data"): + + if ('Pulse' in x) or ('SOFS' in x): + + deployments.append(x) + +deployments.sort(key=last_four) + + +fig, ax = plt.subplots(4,4) + +ax=ax.flatten() + + +all_deployment_dtemp_dtime = [None] * len(deployments) + +for current_deployment, plt_idx in zip(deployments, range(0,len(deployments))): + + print('current deployment is '+current_deployment) + + deployment_dtemp_dtime = np.array([]) + + for root, dirs, files in os.walk("/Users/tru050/Desktop/cloudstor/Shared/SOTS-Temp-Raw-Data"): + + for fname in files: + + if current_deployment in fname and fname.endswith('.nc') and 'FV00' in fname: + + #print(fname) #Here, the wanted file name is printed + + nc = Dataset(os.path.join(root,fname), mode = 'r') + + if 'TEMP' in nc.variables and np.array(nc.variables['TEMP'][:]).ndim == 1 and nc.variables['TIME'].getncattr('units') =='days since 1950-01-01 00:00:00 UTC': + + time_var = nc.variables["TIME"] + + time = num2date(time_var[:], units=time_var.units, calendar=time_var.calendar) + + time_deploy = parser.parse(nc.time_deployment_start, ignoretz=True) + + time_recovery = parser.parse(nc.time_deployment_end, ignoretz=True) + + #print('using '+fname) + + temp_extract = np.array(nc.variables['TEMP'][:][(time >= time_deploy) | (time <= time_recovery)]) + + # Calculate temperature changes + nc_temp_diffs = np.diff(temp_extract) + + # Extract the time data + nc_time = np.array(nc.variables['TIME'][:][(time >= time_deploy) | (time <= time_recovery)]) + + # Convert from days to hours + nc_time_hr = nc_time*24 + + ax[plt_idx].plot(nc_time,temp_extract) + + # Calculate time changes + nc_time_hr_diffs = np.diff(nc_time_hr) + + # Calculate the rate of change of temperature wrt time + nc_dtemp_dtime = np.divide(nc_temp_diffs,nc_time_hr_diffs) + + # Add the results for this netcdf to the record for the deployment + deployment_dtemp_dtime = np.concatenate((deployment_dtemp_dtime,nc_dtemp_dtime)) + + all_deployment_dtemp_dtime[plt_idx] = deployment_dtemp_dtime + + nc.close() + + + + + + + + diff --git a/ocean_dp/sots_processing_runthrough.py b/ocean_dp/sots_processing_runthrough.py new file mode 100755 index 0000000..f9aee09 --- /dev/null +++ b/ocean_dp/sots_processing_runthrough.py @@ -0,0 +1,94 @@ +# Copyright (C) 2020 Ben Weeding +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +# Initial package import +import sys +import glob +import fnmatch +import os +import time + +# Addition of folder containing user defined packages/modules +sys.path.append('/Users/tru050/Documents/GitHub/imos-tools/ocean_dp/qc') +sys.path.append('/Users/tru050/Documents/GitHub/imos-tools/ocean_dp/aggregation') +sys.path.append('/Users/tru050/Documents/GitHub/imos-tools/ocean_dp/processing') + +# Import user defined packages +import add_qc_flags +import in_out_water +import copyDataset +import pressure_interpolator + +import global_range +import rate_of_change_test +import spike_test +import flatline_test +import qc_checker + +start = time.time() + +# Set the working directory +#os.chdir('‎/Users/tru050/Desktop/cloudstor/Shared/SOTS-Temp-Raw-Data/SOFS-5-2015') + +# Make a list of FV00 filenames +fv00_files = glob.glob('*IMOS_ABOS-SOTS*FV00*.nc') + +# Run add_qc_flags.py and collect FV01 filenames +fv01_files = add_qc_flags.add_qc(fv00_files) + +# Run in_out_water.py +for ncfile in fv01_files: + + in_out_water.in_out_water(ncfile,var_name='TEMP') + +# Select pressure files using matching = fnmatch.filter(sofs75filesfv01,'*SOTS*P*_2*.nc') +pres_files = fnmatch.filter(fv01_files,'*IMOS_ABOS-SOTS*P*_2*FV01*.nc') + +# Run copyDataset.py +copyDataset.aggregate(pres_files,'PRES') + +# Run pressure_interpolator.py +fv01_pres_interp_files = pressure_interpolator.pressure_interpolator(netCDFfiles=fv01_files,agg=glob.glob('*IMOS_ABOS-SOTS*Aggregate*.nc')[0]) + +# delete the defunct FV01 files +for ncfile in fv01_files: + + os.remove(ncfile) + +# Global range test +for ncfile in fv01_pres_interp_files: + + print(ncfile) + + global_range.global_range(ncfile,'TEMP',40,-2) + +# Rate of change +rate_of_change_test.roc_test_files(fv01_pres_interp_files,'TEMP',3.36) + +# Spike +spike_test.spike_test_files(fv01_pres_interp_files,target_vars_in=['TEMP']) + +# Flatline +flatline_test.flatline_test_files(fv01_pres_interp_files,['TEMP'],window=20) + +# Check qc process has worked +fv01_qc_checked = qc_checker.qc_checker_files(fv01_pres_interp_files,['TEMP']) + +end = time.time() + +print('time elapsed: '+str(end-start)) + + +