DocScoDoc/tools/anonymize_db.py

161 lines
5.2 KiB
Python
Raw Normal View History

2021-09-18 21:59:54 +02:00
#!/opt/scodoc/venv/bin/python
2020-09-26 16:19:37 +02:00
# -*- coding: utf-8 -*-
# -*- mode: python -*-
##############################################################################
#
# Gestion scolarite IUT
#
2023-01-02 13:16:27 +01:00
# Copyright (c) 1999 - 2023 Emmanuel Viennet. All rights reserved.
2020-09-26 16:19:37 +02:00
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
#
# Emmanuel Viennet emmanuel.viennet@viennet.net
#
##############################################################################
"""Anonymize une base de données ScoDoc
Run as user "scodoc" with scodoc and postgresql up.
2020-09-26 16:19:37 +02:00
E. Viennet, Jan 2019
"""
2021-07-09 18:49:16 +02:00
import psycopg2
import sys
import traceback
2020-09-26 16:19:37 +02:00
def log(msg):
    """Write *msg*, followed by a newline, to standard error.

    Standard output is flushed before the message is written, and standard
    error is flushed right after it.
    """
    sys.stdout.flush()
    err = sys.stderr
    err.write(msg + "\n")
    err.flush()
2023-02-09 12:29:46 +01:00
def usage():
    """Print the command-line synopsis on standard error and exit.

    Both standard streams are flushed first. The process then terminates
    with exit status 1 (``sys.exit`` raises ``SystemExit``).
    """
    for stream in (sys.stdout, sys.stderr):
        stream.flush()
    script = sys.argv[0]
    print(f"Usage: {script} [--users] dbname", file=sys.stderr)
    sys.exit(1)
2020-09-26 16:19:37 +02:00
# --- Anonymisation expressions, written as SQL fragments.
# Each value is pasted verbatim on the right-hand side of "SET column = ...".
# NOTE(review): random_text_md5 is not defined in this file; presumably it is
# a SQL function available in the target database -- confirm.
anonymize_name = "random_text_md5(8)"
anonymize_date = "'1970-01-01'"
anonymize_question_str = "'?'"
anonymize_null = "NULL"

# --- Columns to anonymise (this configuration could be moved to a separate
# file, which would make the code generic for any PostgreSQL database).
#
# We try to remove the personal data of students and companies.
#
# Columns are grouped by table here and flattened below into
# "table.column" keys; the order of tables and columns is preserved.
_FIELDS_BY_TABLE = {
    "identite": {
        "nom": anonymize_name,
        "prenom": anonymize_name,
        "nom_usuel": anonymize_null,
        "civilite": "'X'",
        "date_naissance": anonymize_date,
        "lieu_naissance": anonymize_question_str,
        "dept_naissance": anonymize_question_str,
        "nationalite": anonymize_question_str,
        "statut": anonymize_null,
        "boursier": anonymize_null,
        "photo_filename": anonymize_null,
        "code_nip": anonymize_null,
        "code_ine": anonymize_null,
        "scodoc7_id": anonymize_null,
    },
    "adresse": {
        "email": "'ano@nyme.fr'",
        "emailperso": anonymize_null,
        "domicile": anonymize_null,
        "codepostaldomicile": anonymize_null,
        "villedomicile": anonymize_null,
        "paysdomicile": anonymize_null,
        "telephone": anonymize_null,
        "telephonemobile": anonymize_null,
        "fax": anonymize_null,
    },
    "admissions": {
        "nomlycee": anonymize_name,
    },
    "billet_absence": {
        "description": anonymize_null,
    },
    "etud_annotations": {
        "comment": anonymize_name,
    },
    "notes_appreciations": {
        "comment": anonymize_name,
    },
}

# Maps "table.column" to the SQL expression that overwrites that column.
ANONYMIZED_FIELDS = {
    f"{table}.{column}": expression
    for table, columns in _FIELDS_BY_TABLE.items()
    for column, expression in columns.items()
}
2020-09-26 16:19:37 +02:00
def anonymize_column(cursor, tablecolumn):
    """Anonymise one column by overwriting it in every row of its table.

    tablecolumn has the form table_name.column_name, for example
    "identite.nom". It must be a key of ANONYMIZED_FIELDS: the associated
    value is the SQL expression assigned to the column.

    The statement is executed on *cursor*; nothing is committed here.
    """
    table_name, column_name = tablecolumn.split(".")
    replacement = ANONYMIZED_FIELDS[tablecolumn]
    log(f"processing {tablecolumn}")
    # No WHERE clause: the whole column is rewritten.
    statement = f"UPDATE {table_name} SET {column_name} = {replacement};"
    cursor.execute(statement)
def anonymize_users(cursor):
    """Anonymise the "user" table.

    One UPDATE statement per column is executed on *cursor*, each applying
    to every row; nothing is committed here.
    """
    log("processing user table")
    # SET clauses, applied in this order, one statement each.
    # NOTE(review): prenom receives the same 'nom_<id>' value as nom --
    # confirm this is intended.
    assignments = (
        "email = 'x@y.fr'",
        "password_hash = '*'",
        "password_scodoc7 = NULL",
        "date_created = '2001-01-01'",
        "date_expiration = '2201-12-31'",
        "token = NULL",
        "token_expiration = NULL",
        "nom=CONCAT('nom_', id)",
        "prenom=CONCAT('nom_', id)",
    )
    for assignment in assignments:
        cursor.execute(f'UPDATE "user" SET {assignment};')
2021-07-09 18:49:16 +02:00
2020-09-26 16:19:37 +02:00
def anonymize_db(cursor):
    """Process, one at a time, every column listed in ANONYMIZED_FIELDS.

    Columns are handled in the dictionary's iteration order, each through
    anonymize_column() on the given *cursor*.
    """
    for field in ANONYMIZED_FIELDS:
        anonymize_column(cursor, field)
2023-02-09 12:29:46 +01:00
# --- Command line: [--users] dbname
process_users = False
if len(sys.argv) < 2 or len(sys.argv) > 3:
    usage()
if len(sys.argv) > 2:
    # With two arguments, the first one must be the --users flag.
    if sys.argv[1] != "--users":
        usage()
    dbname = sys.argv[2]
    process_users = True
else:
    dbname = sys.argv[1]

log(f"\nAnonymizing database {dbname}")
cnx_string = "dbname=" + dbname
try:
    cnx = psycopg2.connect(cnx_string)
except Exception:
    log(f"\n*** Error: can't connect to database {dbname} ***\n")
    log(f"""connexion string was "{cnx_string}" """)
    traceback.print_exc()
    # Stop here: without a connection, the code below would only fail with a
    # NameError on cnx, hiding the real problem behind a second traceback.
    sys.exit(1)

try:
    # Explicit transaction: all updates are committed together at the end.
    cnx.set_session(autocommit=False)
    cursor = cnx.cursor()
    anonymize_db(cursor)
    if process_users:
        anonymize_users(cursor)
    cnx.commit()
finally:
    # Always release the connection, including when an update fails
    # (in which case commit() above is never reached).
    cnx.close()