Commit ebf64e9a authored by LIBES Maurice's avatar LIBES Maurice 🏋🏾
Browse files

Rajout d'une fonction qui compare les en-têtes des colonnes des fichiers

CSV présents dans le répertoire CsvFiles
parent d1f9fd33
No preview for this file type
No preview for this file type
#!/usr/bin/python3
# -*- coding: utf-8 -*- ## important d'avoir ça en 2ème ligne pour les accents (é) et les apostrophes
#
# traite_ctd_netcdf.py
#
# Copyright Aout 2018 Maurice Libes <maurice.libes@osupytheas.fr>
#
# Objet: convertit tout type de fichiers CSV en fichier NetCDF
#
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
# MA 02110-1301, USA.
#
#
import copy
import netCDF4 as nc
import pandas as pd
......@@ -13,7 +39,7 @@ from termcolor import cprint
def Convert_Datenum():
dateHeure = []
for dateiso in csvdata["Date"]:
for dateiso in csvdata["DateTime"]:
#print("dateiso = ",dateiso)
try:
date,heure =dateiso.split('T')
......@@ -23,7 +49,7 @@ def Convert_Datenum():
datenum = nc.date2num(dateobs, units=NC_TIME_FMT)
dateHeure.append(datenum)
except:
print("La date attendue n'est pas au format ISO yyymmddThh:mm:ss ",dateiso)
cprint("La date attendue n'est pas au format ISO yyymmddThh:mm:ss "+dateiso, color='red', attrs=['bold'])
sys.exit(0)
#print("tableau dateHeure ",dateHeure)
......@@ -55,6 +81,7 @@ def Get_dimension():
##
def Index_selector(index):
#Selectionne une ligne du fichier csv par son nom d'index
liste_att_obligatoires=['unites','standard_name','long_name','dimension','cf_role']
if(index == 'unites'):
final = csvheader.loc[index]
elif(index == 'standard_name'):
......@@ -64,48 +91,13 @@ def Index_selector(index):
elif(index == 'dimension'):
final = csvheader.loc[index]
elif(index == 'Station'):
final = csvheader.loc[index,'Station_name']
print("final ",final)
final = csvheader.loc[index,'Station']
#print("final ",final)
return final
##
def Index_Selector_Metadata(index):
    """Return the value of one global attribute read from the metadata CSV.

    The row labelled ``index`` is looked up in the module-level ``glob_att``
    DataFrame and the value in its first column is returned.

    The previous implementation was a long ``if``/``elif`` chain in which
    every branch performed the same lookup.  One of its conditions ended with
    ``or 'cdm_trajectory_variables'`` (a non-empty string, hence always true),
    so the error branch could never run; had it run, it would have returned an
    unassigned variable and passed a list as ``color`` to ``cprint``.

    Args:
        index: Row label of the attribute to fetch (e.g. ``'title'``).

    Returns:
        The first value stored in the row labelled ``index``.

    Raises:
        KeyError: if ``index`` is not a row label of ``glob_att``.  The
            error message is printed in red before the exception is raised.
    """
    if index not in glob_att.index:
        cprint("l'index que vous avez mis n'existe pas dans le fichier csv metadata", color='red', attrs=['bold'])
        # Same exception type the bare ``.loc`` lookup raised before.
        raise KeyError(index)
    # Take the value in first position (0) of the selected row.
    return glob_att.loc[index].values[0]
##
def Get_Formats():
#retourne le format (type) de chaque valeur des colonnes du fichier CSV
......@@ -154,17 +146,16 @@ def Create_Write_Variables(dim, csvfile):
# déterminer le type des variables (int, float, str...)
frmt=Get_Formats()
print("sortie frmt ", frmt)
# recupere le nom des colonnes
#print("sortie frmt ", frmt)
# convertit les dates ISO 2011-01-16T09:38:00 en format numérique pour NetCDF
dateHeure=Convert_Datenum()
#print(dateHeure)
i = 0 # index de colonnes
for col in cols:
print("format ",frmt[i] )
print("col ",col)
if (col == "Date"):
#print("col ",col)
if (col == "DateTime"):
frmt[i]='i4' #on force la date en Int car on l'a convertie en Int avec la fonction Convert_Datenum()
units[i] = NC_TIME_FMT
......@@ -177,6 +168,7 @@ def Create_Write_Variables(dim, csvfile):
## tabcol.toto = toto[i]
# attributs standards pour toutes les variables
#ncFile.setncattr('st',142)
tabcol.units = units[i]
tabcol.long_name = long_name[i]
tabcol.standard_name = standard_name[i]
......@@ -189,28 +181,22 @@ def Create_Write_Variables(dim, csvfile):
if (col=="Depth" or col=="Profondeur"):
tabcol.axis="Z"
## cas des valeurs en chaines de caracteres : elles sont traitées différement
## cas des valeurs en chaines de caracteres : elles sont traitées différement pour Netcdf
if ( frmt[i][0] =='S'):
value=csvdata[col]
print("valeur ",value.values[0])
l=len(value.values[0])
print("** longueur tableau: ",len(value),"long chaine ",l)
#value=csvdata[col]
#print("valeur ",value.values[0])
#print("** longueur tableau: ",len(value),"long chaine ",l)
tabcol._Encoding = 'ascii'
datain = np.array(csvdata[col].values,dtype=frmt[i])
print("data IN ",datain)
string_values = np.array(csvdata[col].values,dtype=frmt[i])
## ecriture des valeurs dans les tableaux
if (col=='Date'):
## ecriture des colonnes de valeurs CSV dans les tableaux tabcol[] des variables de Netcdf
if (col=='DateTime'):
tabcol[:] = dateHeure# cas particulier ecriture des valeurs de temps dans la variable NetCDF
tabcol.origin = NC_TIME_ORIGIN
else:
print("*** ",csvdata[col].values)
#print("*** ",csvdata[col].values)
if ( frmt[i][0] =='S'):
tabcol[:] = datain
tabcol[:] = string_values
else:
tabcol[:] = csvdata[col].tolist() # on ecrit les valeurs des colonnes du fichier CSV dans le tableau de la variable NetCDF
......@@ -222,7 +208,6 @@ def Create_Write_Variables(dim, csvfile):
##
def Create_Variable_Station():
#Creation de la variable station name (cas particulier|fausse dimension)
station_name = ncFile.createVariable('station_name', 'S1', 'lenstation')
station_name.long_name = "station_name"
station_name.cf_role = featuretype+"_id"
......@@ -234,47 +219,29 @@ def Create_Variable_Station():
##
def Create_Global_Attributes(ncFile):
# Creation des global attributs/metadata du fichier NC lues dans un fichier CSV externe
ncFile.rfa = Index_Selector_Metadata('rfa') #glob_att.loc[index].to_string()
ncFile.license = Index_Selector_Metadata('license')
ncFile.summary = Index_Selector_Metadata('summary')
ncFile.lineage = Index_Selector_Metadata('lineage')
ncFile.project = Index_Selector_Metadata('project')
ncFile.description = Index_Selector_Metadata('description')
ncFile.title = Index_Selector_Metadata('title')
ncFile.keywords = Index_Selector_Metadata('keywords')
ncFile.comments = Index_Selector_Metadata('comment')
ncFile.history = Index_Selector_Metadata('history')
ncFile.featuretype = featuretype
ncFile.cdm_data_type = featuretype
ncFile.conventions = Index_Selector_Metadata('conventions')
#traite le cas particulier featuretype
if featuretype =='timeserie':
ncFile.cdm_timeserie_variables = Index_Selector_Metadata('cdm_timeserie_variables')
elif featuretype =='profile':
ncFile.cdm_profile_variables = Index_Selector_Metadata('cdm_profile_variables')
elif featuretype =='trajectory':
ncFile.cdm_trajectory_variables = Index_Selector_Metadata('cdm_trajectory_variables')
else:
cprint("Erreur le featuretype est mal defini ", color='red', attrs=['bold'])
sys.exit(0)
indexlist = glob_att.index
for index in indexlist:
value = glob_att.loc[index].values[0]
if index == 'Conventions':
ncFile.setncattr(index, value)
else:
ncFile.setncattr(index.casefold(), value)
cprint('Global Attributs OK', color='green', attrs=['bold'])
return 0
##
def Control_Global_Attributes():
#Test s'il existe bien dans notre ficher csv metadata les 4 colonnes importante featuretype|conventions...
gb_att = glob_att.index
indexlist = glob_att.index
#print("feature type ",featuretype)
if ( ('featuretype' in gb_att) and ('cdm_data_type' in gb_att) and ('conventions' in gb_att) ):
if ( ('featuretype' in indexlist) and ('cdm_data_type' in indexlist) and ('Conventions' in indexlist) ):
cprint("Control Attributes OK", color='green', attrs=['bold'])
if (featuretype=="timeserie"):
if ('cdm_timeserie_variables' not in gb_att):
if ('cdm_timeserie_variables' not in indexlist):
cprint("il manque cdm_timeserie_variables", color='red', attrs=['bold'])
sys.exit(0)
elif (featuretype=="profile"):
if ('cdm_profile_variables' not in gb_att):
if ('cdm_profile_variables' not in indexlist):
cprint("il manque cdm_profile_variables", color='red', attrs=['bold'])
sys.exit(0)
else:
......@@ -297,12 +264,23 @@ def Control_Illegal_char():
##
def Control_Header_Columns():
#Controle si les columns date|latitude|longitude existe bien dans le fichier csv
if ('Date' in cols) and ('Latitude' in cols) and ('Longitude' in cols):
if ('DateTime' in cols) and ('Latitude' in cols) and ('Longitude' in cols):
cprint("Control Columns OK", color='green', attrs=['bold'])
else:
cprint("Control failed!\nIl manque la Date|Latitude|Longitude dans votre fichier :"+fic, color='red', attrs=['bold'])
cprint("Control failed!\nIl manque une des colonnes DateTime|Latitude|Longitude dans votre fichier :"+fic, color='red', attrs=['bold'])
sys.exit(0)
return 0
##
def Compare_Files(cols,firstcols):
    """Stop the program when two CSV files do not have the same column headers.

    The single global-attributes file only makes sense when every CSV file
    processed in the same run has the same columns, so the headers of the
    current file are checked against those of the previously read file.

    Args:
        cols: Column names of the current CSV file (any sequence of names).
        firstcols: Column names of the previously read CSV file.

    Returns:
        0 when both header sequences hold the same names in the same order.

    Raises:
        SystemExit: when the headers differ (after printing both sequences
            and an error message).
    """
    # The former test `cols.all() != firstcols.all()` reduced each array to a
    # single value before comparing, so the names were never compared one by
    # one.  Comparing the sequences as lists checks every name, the order and
    # the number of columns.
    if list(cols) != list(firstcols):
        print('First cols',firstcols)
        print('\ncols',cols)
        cprint("Les fichiers CSV dans le repertoire CsvFiles sont differents !!", color='red',attrs=['bold'])
        cprint("Le fichier global_attributes.csv ne concerne que des fichiers identiques !!", color='red',attrs=['bold'])
        # Exit status kept at 0 to match the other control functions of this script.
        sys.exit(0)
    return 0
### Programme Principal
###
......@@ -313,9 +291,8 @@ if __name__ == "__main__":
global stationname, lenstation, OutputPath
global glob_att,featuretype
global ncFile,fic
global cols
global cols,indexlist
NC_TIME_FMT = 'seconds since 1970-01-01 00:00:00 UTC'
NC_TIME_ORIGIN = "01-JAN-1970 00:00:00"
......@@ -332,6 +309,7 @@ if __name__ == "__main__":
args = parser.parse_args()
allfiles = os.listdir(InputPath)
firstcols=[]; nbfic=0
for fic in allfiles:
if fic.endswith(".csv"):
cprint("Traitement fichier "+fic, color='blue', attrs=['bold'])
......@@ -343,20 +321,32 @@ if __name__ == "__main__":
else:
stationname = args.name
lenstation = len(stationname)
csvheader = pd.read_csv(InputPath+fic, sep=args.separator,index_col=0)
indexdrop=csvheader.index.dropna() #cette ligne selectionne le header en supprimant toute la valeur vide(NAN)
csvheader = pd.read_csv(InputPath+fic, low_memory=False, sep=args.separator,index_col=0)
nbfic+=1
#print('test',csvheader)
indexdrop=csvheader.index.dropna() #cette ligne selectionne le header en supprimant toute les valeurs vides (NAN)
csvdata=pd.read_csv(InputPath+fic, sep=args.separator,skiprows=range(1,len(indexdrop)+1),index_col=0)
print('test:\n',csvdata)
## on extrait le nom des colonnes du fichier CSV
cols = csvdata.columns.values
#print(" cols ",cols)
#print('cols',cols, 'type ',type(cols))
if (nbfic > 1):
Compare_Files(cols,firstcols)
firstcols = cols
Control_Header_Columns()
Control_Illegal_char()
glob_att = pd.read_csv(fileMetaData, delimiter='=', header=None, index_col=0)
# retourne le type de donnees netcdf : timeserie ou profile ou trajectory
featuretype = Index_Selector_Metadata('featuretype')
indexlist = glob_att.index
# retourne le type de donnees netcdf : timeserie ou profile ou trajectory
featuretype = glob_att.loc['featuretype'].values[0]
Control_Global_Attributes()
#
dimension = Get_dimension()
......@@ -370,4 +360,4 @@ if __name__ == "__main__":
Create_Write_Variables(dimension, fic)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment