Python_scripts/1_SICEN/sicen2_clean.py

449 lines
15 KiB
Python

from os import replace
import pandas as pd
import numpy as np
from geoalchemy2 import Geometry
# import numpy as np
from sqlalchemy import create_engine, text
from sqlalchemy.engine import URL
import geopandas as gpd
# Parametres bdd
user = 'cgeier'
pwd = 'adm1n*bdCen'
adr = '91.134.194.221'
port = '5432'
base = 'sicen2'
bckup_user = 'postgres'
bckup_pwd = '1234'
bckup_adr = '192.168.0.81'
bckup_port = '5455'
bckup_base = 'sicen2'
url = URL.create('postgresql+psycopg2',
username=user,
password=pwd,
host=adr,
database=base,
)
con = create_engine(url)
bckup_url = URL.create('postgresql+psycopg2',
username=bckup_user,
password=bckup_pwd,
host=bckup_adr,
database=bckup_base,
)
bckup_con = create_engine(bckup_url)
# con = create_engine('postgresql+psycopg2://{1}:{0}@{2}:{3}/{4}'.format(pwd, user, adr, port, base), echo=False)
# bckup_con = create_engine('postgresql+psycopg2://{1}:{0}@{2}:{3}/{4}'.format(bckup_pwd, bckup_user, bckup_adr, bckup_port, bckup_base), echo=False)
def update_data(df, con, sch, tab, epsg=None):
columns = df.columns.to_list()
frame = df.copy()
frame.replace("'","''", regex=True, inplace=True)
pkey = con.dialect.get_pk_constraint(con, table_name=tab, schema=sch)['constrained_columns']
for col in frame.select_dtypes(include=['object']).columns:
test = frame[frame[col].str.contains("'", na=False)]
if not test.empty:
frame[col].replace("'", "''", inplace=True, regex=True)
# print(frame[col])
if 'geom' in columns or 'geometry' in columns:
if epsg or df.crs:
if not epsg:
epsg = df.crs.to_epsg()
name_geom = df.geometry.name
frame[name_geom] = 'SRID={epsg};'.format(epsg=epsg) + df[name_geom].map(str)
# else: return 'No crs define in update_data or in gdf'
for c, col in enumerate(columns):
if c == 0:
frame['insert'] = "('" + frame[col].map(str)
# break
else:
frame['insert'] = frame['insert'] + "','" + frame[col].map(str)
if c == len(columns)-1:
frame['insert'] = frame['insert'] + "')"
# if c == 0:
# frame['insert'] = '("' + frame[col].map(str)
# # break
# else:
# frame['insert'] = frame['insert'] + '","' + frame[col].map(str)
# if c == len(columns)-1:
# frame['insert'] = frame['insert'] + '")'
lst_cols = ', '.join(columns)
lst_vals = ','.join(frame['insert'])
lst_vals.replace("\'None\'", 'NULL', inplace=True)
lst_dupKey = ', '.join([col + '=EXCLUDED.' + col for col in columns])
lst_pkey = ','.join(pkey)
sql = '''INSERT INTO {sch}.{tab} ({lst_cols}) VALUES {lst_vals} ON CONFLICT ({lst_pkey}) DO UPDATE SET {lst_dupKey} ;'''.format(
sch=sch, tab=tab, lst_cols=lst_cols, lst_vals=lst_vals, lst_dupKey=lst_dupKey, lst_pkey=lst_pkey)
# sql = '''INSERT INTO {sch}.{tab} ({lst_cols})
# VALUES {lst_vals}
# ON CONFLICT DO NOTHING;
# '''.format(sch=sch, tab=tab, lst_cols=lst_cols, lst_vals=lst_vals)
try:
con.execute(sql)
# con.execute(text(sql))
print('''
Update OK !''')
except Exception as exept:
print(exept)
df = pd.read_sql_table(
table_name = 'personne',
con = con,
schema = 'md',
index_col='id_personne')
tmp = pd.DataFrame(df.nom.str.split(' ',1).tolist(), columns = ['nom','prenom'], index=df.index)
tmp = tmp[~tmp.prenom.isnull()]
df.loc[tmp.index,'prenom'] = tmp['prenom']
df.loc[tmp.index,'nom'] = tmp['nom']
df['nom'] = [nom.replace('CEN38_','') for nom in df['nom']]
df['nom'] = [nom.replace('GENTIANA_','') for nom in df['nom']]
df['nom'] = [nom.replace('LPO38_','') for nom in df['nom']]
df['nom'] = [nom.replace('LPO_','') for nom in df['nom']]
df['nom'] = [nom.replace('CENRA_','') for nom in df['nom']]
df['nom'] = [nom.replace('GRPLS_','') for nom in df['nom']]
rm_row = ['ISÈRE', 'Ecrin', 'FUR', 'BRIQUIR', 'BORGNE', '(FMBDS)', 'Isère', 'Rhône', '(Presonnel)', 'Monsieur', 'Batrachologique de France', '(Ecosphère)',
'PIC VERT Association', 'BAGOUSSE', '(BIOTOPE)', '(Dauphinelle)', 'Cras', '(GRPLS)', 'et Vie Sociale', '(ONCFS)', 'campagne sauvetage amphibiens']
df = df[~df.prenom.isin(rm_row)]
tmp = pd.DataFrame(df.prenom.str.split(' ',1).tolist(), columns = ['nom','prenom'], index=df.index)
tmp = tmp[~tmp.prenom.isnull()]
tmp.drop(index=[100032,8628,8645,4238,8058,8070,8353,1099,1081,1323,1324], inplace=True)
df.loc[tmp.index,'nom'] = df.loc[tmp.index,'nom'] + ' ' + tmp['nom']
df.loc[tmp.index,'prenom'] = tmp['prenom']
tmp = pd.DataFrame(
df.loc[df.nom=='Abbé',['nom','prenom']].prenom.str.split(' ',1).tolist(),
columns = ['prenom','nom'],
index=df.loc[df.nom=='Abbé',['nom','prenom']].index)
tmp2 = df.loc[df.nom=='Abbé',['nom','prenom']]
df.loc[tmp.index,'nom'] = tmp['nom']
df.loc[tmp.index,'prenom'] = tmp2['nom'] + ' ' + tmp['prenom']
df.reset_index(inplace=True)
tmp = df.copy()
tmp = tmp[['id_personne','nom','prenom', 'id_structure']]
# update_data(tmp,con,sch='md',tab='personne')
id_old = '8044'
id_new = '1000014'
sql='''SELECT * FROM saisie.saisie_observation WHERE observateur = '{0}' OR validateur = {0};'''.format(id_old)
# sql = 'SELECT nom, prenom FROM md.personne WHERE id_personne= 6191;'
pd.read_sql(
sql=sql,
con=con
)[['observateur','validateur']]
# df[['prenom', 'nom']].values.tolist()
# sql = '''UPDATE saisie.saisie_observation SET validateur = 100077
# WHERE validateur = 1110;'''
try:
con.execute(sql)
# con.execute(text(sql))
print('''
Update OK !''')
except Exception as exept:
print(exept)
tmpx = df.copy()
tmpx = tmpx[['id_personne','nom','prenom','id_structure']]
tutux = tmpx.drop_duplicates(['nom','prenom','id_structure'], keep=False)
tomx = tmpx[~tmpx.id_personne.isin(tutux.id_personne)]
tomx = tomx.sort_values('nom')
for nom in tomx.nom.unique().tolist():
tmp1 = tomx[tomx.nom==nom]
keep = max(tmp1.id_personne)
rep = min(tmp1.id_personne)
sql = '''UPDATE saisie.saisie_observation SET observateur = '{keep}' WHERE observateur = '{replace}';'''.format(
keep=keep, replace=rep
)
# sql = '''UPDATE saisie.saisie_observation SET validateur = 100077
# WHERE validateur = 1110;'''
try:
con.execute(sql)
# con.execute(text(sql))
print('''
Update OK !''')
except Exception as exept:
print(exept)
sql='''SELECT * FROM saisie.saisie_observation WHERE observateur = '{replace}' OR validateur = {replace};'''.format(
replace=rep
)
# sql = 'SELECT nom, prenom FROM md.personne WHERE id_personne= 6191;'
test = pd.read_sql(
sql=sql,
con=con
)[['observateur','validateur']]
if test.empty:
print(''' OK for %s'''%nom)
else:
print('ERROR !!!!!!!!!')
break
print(nom)
sql='''SELECT * FROM md.personne WHERE nom like '{nom}%';'''.format(nom=nom)
tutu = pd.read_sql(
sql=sql,
con=con
)
if tutu.shape[0] > 1:
df
print('''
{id_personne},{nom},{prenom},{id_structure}'''.format(
id_personne=id_personne,nom=nom,prenom=prenom,id_structure=id_structure))
from sqlalchemy import create_engine, text
from sqlalchemy.engine import URL
from shapely.geometry import Point
import matplotlib.pyplot as plt
import contextily as cx
import geopandas as gpd
import pandas as pd
# Parametres bdd
user = 'cgeier'
pwd = 'adm1n*bdCen'
adr = '91.134.194.221'
port = '5432'
base = 'sicen2'
url = URL.create('postgresql+psycopg2',
username=user,
password=pwd,
host=adr,
database=base,
)
con = create_engine(url)
df1 = pd.read_sql_table(
table_name = 'saisie_observation',
con = con,
schema = 'saisie',
index_col='id_obs')
geometry = [Point(xy) for xy in zip(df1['longitude'], df['latitude'])]
crs = 'epsg:4326'
df = gpd.GeoDataFrame(df1, crs=crs, geometry=geometry)
df = df.to_crs(epsg=3857)
df_unkeep = df.drop_duplicates(keep=False)
df_clean = df.drop_duplicates(keep='first')
df[~df.index.isin(df_clean.index)].sort_values('date_obs').index.values
df[~df.index.isin(df_unkeep.index)].sort_values('date_obs').index.values
ax = df[~df.index.isin(df_unkeep.index)][['geometry']].plot()
cx.add_basemap(ax)
plt.show()
df.sort_index(inplace=True)
data_drop = df[~df.index.isin(df.drop_duplicates(keep=False).index)]
tmp = [i.split('&') for i in df['observateur'].unique() ]
lst_pers = [item for sublist in tmp for item in sublist]
df_pers = pd.DataFrame(lst_pers, columns=['id_pers']).drop_duplicates()
sql='''SELECT * FROM md.personne WHERE id_personne IN ({ids});'''.format(ids=','.join(df_pers.id_pers))
tutu = pd.read_sql(
sql=sql,
con=con
)
df_pers['id_pers'] = df_pers['id_pers'].astype(int)
df_pers[~df_pers.id_pers.isin(tutu.id_personne)]
tom = toto.copy()
tom = tom[tom.observateur.str.contains('&')]
tom['observateur'] = [idx.replace('1043','100086') for idx in tom['observateur']]
tom['observateur'] = [idx.replace('6191','1000002') for idx in tom['observateur']]
tom['observateur'] = [idx.replace('6051','1000013') for idx in tom['observateur']]
tom['observateur'] = [idx.replace('1070','1000011') for idx in tom['observateur']]
tom['observateur'] = [idx.replace('1088','100043') for idx in tom['observateur']]
tom['observateur'] = [idx.replace('6045','1000004') for idx in tom['observateur']]
tom['observateur'] = [idx.replace('6121','1000009') for idx in tom['observateur']]
tom['observateur'] = [idx.replace('6355','1000003') for idx in tom['observateur']]
tom['observateur'] = [idx.replace('8632','1000005') for idx in tom['observateur']]
tom['observateur'] = [idx.replace('1034','100007') for idx in tom['observateur']]
tom['observateur'] = [idx.replace('1110','100077') for idx in tom['observateur']]
tom['observateur'] = [idx.replace('1075','1000022') for idx in tom['observateur']]
tom['observateur'] = [idx.replace('1188','100038') for idx in tom['observateur']]
tom['observateur'] = [idx.replace('8271','100017') for idx in tom['observateur']]
tom['observateur'] = [idx.replace('1049','100053') for idx in tom['observateur']]
tom['observateur'] = [idx.replace('8328','100034') for idx in tom['observateur']]
tom['observateur'] = [idx.replace('1102','100062') for idx in tom['observateur']]
tom['observateur'] = [idx.replace('1016','1000001') for idx in tom['observateur']]
tom['observateur'] = [idx.replace('1187','100037') for idx in tom['observateur']]
tom['observateur'] = [idx.replace('8018','1000008') for idx in tom['observateur']]
tom['observateur'] = [idx.replace('4233','100067') for idx in tom['observateur']]
tom['observateur'] = [idx.replace('8254','1000012') for idx in tom['observateur']]
tom['observateur'] = [idx.replace('8015','100016') for idx in tom['observateur']]
df.sort_index(inplace=True)
tom.sort_index(inplace=True)
df.loc[df.index.isin(tom.index), 'observateur'] = tom['observateur']
df.reset_index(inplace=True)
tmp = df.copy()
tmp = tmp[['id_obs','observateur']]
update_data(tmp,con,sch='saisie',tab='saisie_observation')
# Check user in suivi_saisie_observation existent tous!
from datetime import datetime as dt
df = pd.read_sql_table(
table_name = 'suivi_saisie_observation',
con = con,
schema = 'saisie',
)
df.sort_values('date_operation', ascending=False, inplace=True)
x = '2021-05-25 22:00:00'
y = '2021-05-21 12:30:00'
x = dt.strptime(x, '%Y-%m-%d %H:%M:%S')
y = dt.strptime(y, '%Y-%m-%d %H:%M:%S')
ddf = df[(df.date_operation < x) & (df.date_operation > y)]
ddf = ddf[['operation','utilisateur', 'id_obs', 'observateur', 'validateur', 'numerisateur']]
# Get all "id" observateur
tmp_obs = [i.split('&') for i in ddf['observateur'].dropna().unique() ]
lst_pers_obs = [item for sublist in tmp_obs for item in sublist]
tmp_val = [i.split('&') for i in ddf['validateur'].dropna().unique().astype(int).astype(str) ]
lst_pers_val = [item for sublist in tmp_val for item in sublist]
tmp_num = [i.split('&') for i in ddf['numerisateur'].dropna().unique().astype(str) ]
lst_pers_num = [item for sublist in tmp_num for item in sublist]
lst_pers = lst_pers_obs + lst_pers_val + lst_pers_num
df_pers = pd.DataFrame(lst_pers, columns=['id_pers']).drop_duplicates()
# Check if personnes exist
pers_not_exist = []
for i in df_pers.id_pers:
sql = 'SELECT id_personne,prenom, nom, id_structure FROM md.personne WHERE id_personne = %s;'%i
res = pd.read_sql(
sql = sql,
con = con
)
if res.empty:
print(i)
pers_not_exist += [i]
sql = '''SELECT * FROM saisie.suivi_saisie_observation
WHERE utilisateur = 'inconnu'
AND date_operation < '2021-05-25 22:00:00'
AND date_operation > '2021-05-21 12:30:00';'''
pd.read_sql(
sql=sql,
con=con
)
sql = '''UPDATE saisie.suivi_saisie_observation SET utilisateur = 'colas.geier@cen-isere.org'
WHERE utilisateur = 'inconnu'
AND date_operation < '2021-05-25 22:00:00'
AND date_operation > '2021-05-21 12:30:00';'''
try:
con.execute(sql)
# con.execute(text(sql))
print('''
Update OK !''')
except Exception as exept:
print(exept)
rep = '8044'
keep = '1000014'
sql_check ='''SELECT * FROM saisie.saisie_observation WHERE observateur like '%{replace}%' OR validateur = {replace};'''.format(
replace=rep
)
sql_check ='''SELECT * FROM saisie.saisie_observation WHERE observateur = '{replace}' OR validateur = {replace};'''.format(
replace=rep
)
sql_updt = '''
UPDATE saisie.saisie_observation SET observateur = '{keep}' WHERE observateur = '{replace}';
UPDATE saisie.saisie_observation SET numerisateur = {keep} WHERE numerisateur = {replace};
'''.format(
keep=keep, replace=rep
)
# sql = '''UPDATE saisie.saisie_observation SET validateur = 100077
# WHERE validateur = 1110;'''
test = pd.read_sql(
sql=sql_check,
con=con
)[['observateur','validateur']]
if test.empty:
print(''' OK for %s in table saisie'''%nom)
else:
try:
con.execute(sql_updt)
# con.execute(text(sql))
print('''
Update OK !''')
except Exception as exept:
print(exept)
sql_check2 ='''SELECT * FROM saisie.suivi_saisie_observation WHERE observateur = '{replace}' OR validateur = '{replace}';'''.format(
replace=rep
)
sql_updt2 = '''
UPDATE saisie.suivi_saisie_observation SET observateur = '{keep}' WHERE observateur = '{replace}';
UPDATE saisie.suivi_saisie_observation SET numerisateur = {keep} WHERE numerisateur = {replace};
'''.format(
keep=keep, replace=rep
)
test2 = pd.read_sql(
sql=sql_check2,
con=con
)[['observateur','validateur']]
if test2.empty:
print(''' OK for %s in table saisie'''%nom)
else:
try:
con.execute(sql_updt2)
# con.execute(text(sql))
print('''
Update OK !''')
except Exception as exept:
print(exept)
sql='''SELECT * FROM saisie.saisie_observation WHERE observateur = '{replace}' OR validateur = {replace};'''.format(
replace=rep
)
# sql = 'SELECT nom, prenom FROM md.personne WHERE id_personne= 6191;'
test = pd.read_sql(
sql=sql,
con=con
)[['observateur','validateur']]
if test.empty:
print(''' OK for %s'''%nom)
else:
print('ERROR !!!!!!!!!')
# break