2021-09-18 21:59:54 +02:00
|
|
|
#!/opt/scodoc/venv/bin/python
|
2020-09-26 16:19:37 +02:00
|
|
|
# -*- coding: utf-8 -*-
|
|
|
|
# -*- mode: python -*-
|
|
|
|
|
|
|
|
##############################################################################
|
|
|
|
#
|
|
|
|
# Gestion scolarite IUT
|
|
|
|
#
|
2023-12-31 23:04:06 +01:00
|
|
|
# Copyright (c) 1999 - 2024 Emmanuel Viennet. All rights reserved.
|
2020-09-26 16:19:37 +02:00
|
|
|
#
|
|
|
|
# This program is free software; you can redistribute it and/or modify
|
|
|
|
# it under the terms of the GNU General Public License as published by
|
|
|
|
# the Free Software Foundation; either version 2 of the License, or
|
|
|
|
# (at your option) any later version.
|
|
|
|
#
|
|
|
|
# This program is distributed in the hope that it will be useful,
|
|
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
|
# GNU General Public License for more details.
|
|
|
|
#
|
|
|
|
# You should have received a copy of the GNU General Public License
|
|
|
|
# along with this program; if not, write to the Free Software
|
|
|
|
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
|
|
|
#
|
|
|
|
# Emmanuel Viennet emmanuel.viennet@viennet.net
|
|
|
|
#
|
|
|
|
##############################################################################
|
|
|
|
|
|
|
|
"""Anonymize une base de données ScoDoc
|
|
|
|
|
2021-07-21 22:32:30 +02:00
|
|
|
Run as user "scodoc", with ScoDoc and PostgreSQL up and running.
|
2020-09-26 16:19:37 +02:00
|
|
|
|
2024-03-01 11:12:36 +01:00
|
|
|
Travaille entièrement au niveau SQL, n'utilise aucun modèle SQLAlchemy.
|
2020-09-26 16:19:37 +02:00
|
|
|
|
2024-03-01 11:12:36 +01:00
|
|
|
E. Viennet, Jan 2019, Fev 2024
|
|
|
|
"""
|
|
|
|
import random
|
2021-07-09 18:49:16 +02:00
|
|
|
import sys
|
|
|
|
import traceback
|
2024-03-01 11:12:36 +01:00
|
|
|
import psycopg2
|
|
|
|
from psycopg2 import extras
|
2024-06-03 15:14:27 +02:00
|
|
|
import urllib.parse
|
|
|
|
import re
|
2021-07-09 18:49:16 +02:00
|
|
|
|
2020-09-26 16:19:37 +02:00
|
|
|
|
|
|
|
def log(msg):
    """Write *msg* followed by a newline on stderr.

    stdout is flushed first and stderr is flushed afterwards, so that the
    message is not interleaved with output still buffered on stdout.
    """
    out, err = sys.stdout, sys.stderr
    out.flush()
    line = msg + "\n"
    err.write(line)
    err.flush()
|
|
|
|
|
|
|
|
|
2023-02-09 12:29:46 +01:00
|
|
|
def usage():
    """Print the command-line synopsis on stderr and exit with status 1.

    The positional argument is a database connection URI (it is handed
    unchanged to ``psycopg2.connect``), not a bare database name.
    """
    sys.stdout.flush()
    sys.stderr.flush()
    print(f"Usage: {sys.argv[0]} [--users] dburi", file=sys.stderr)
    print(
        "  dburi: connection URI, e.g. postgresql://user:password@host:5432/DBNAME",
        file=sys.stderr,
    )
    sys.exit(1)
|
|
|
|
|
|
|
|
|
2020-09-26 16:19:37 +02:00
|
|
|
# --- Anonymization "functions", expressed as SQL
#
# Each constant is a fragment of SQL text that is pasted verbatim on the
# right-hand side of an "UPDATE ... SET column = <fragment>" statement.

# Call to the SQL function random_text_md5
# (presumably defined in the ScoDoc database -- TODO confirm).
anonymize_name = "random_text_md5(8)"

# Fixed date literal used for every date column.
anonymize_date = "'1970-01-01'"

# SQL boolean false.
anonymize_false = "FALSE"

# One-character placeholder string.
anonymize_question_str = "'?'"

# SQL NULL: the value is simply dropped.
anonymize_null = "NULL"
|
|
|
|
|
2024-03-01 11:12:36 +01:00
|
|
|
# --- Lists of last names and first names used to replace identities
def _load_names(path):
    """Return the non-empty, whitespace-stripped lines of the UTF-8 file *path*.

    The file is closed as soon as it has been read. Blank lines are skipped:
    an empty name would otherwise end up in the database, and an empty first
    name breaks the login generation that takes its first letter.
    """
    with open(path, encoding="utf8") as names_file:
        stripped = (line.strip() for line in names_file)
        return [name for name in stripped if name]


NOMS = _load_names("/opt/scodoc/tools/fakeportal/nomsprenoms/noms.txt")
PRENOMS = _load_names("/opt/scodoc/tools/fakeportal/nomsprenoms/prenoms.txt")
|
|
|
|
|
2021-08-15 16:43:53 +02:00
|
|
|
# --- Fields to anonymize (this configuration could live in a separate
# file, and the code would then be generic for any postgresql database).
#
# The goal is to remove the personal data of students and companies.
#
# The mapping is keyed by "table.column"; each value is the SQL expression
# assigned to that column. It is written grouped by table and flattened
# here; the resulting key order is the order in which columns are processed.
ANONYMIZED_FIELDS = {
    f"{table}.{column}": replacement
    for table, columns in {
        "identite": {
            "nom": anonymize_name,
            "prenom": anonymize_name,
            "nom_usuel": anonymize_null,
            "civilite_etat_civil": anonymize_null,
            "prenom_etat_civil": anonymize_null,
            "date_naissance": anonymize_date,
            "lieu_naissance": anonymize_question_str,
            "dept_naissance": anonymize_question_str,
            "nationalite": anonymize_question_str,
            "statut": anonymize_null,
            "boursier": anonymize_false,
            "photo_filename": anonymize_null,
            "code_nip": anonymize_null,
            "code_ine": anonymize_null,
            "scodoc7_id": anonymize_null,
        },
        "adresse": {
            "email": "'ano@nyme.fr'",
            "emailperso": anonymize_null,
            "domicile": anonymize_null,
            "codepostaldomicile": anonymize_null,
            "villedomicile": anonymize_null,
            "paysdomicile": anonymize_null,
            "telephone": anonymize_null,
            "telephonemobile": anonymize_null,
            "fax": anonymize_null,
        },
        "admissions": {
            "nomlycee": anonymize_name,
        },
        "billet_absence": {
            "description": anonymize_null,
        },
        "etud_annotations": {
            "comment": anonymize_name,
        },
        "notes_appreciations": {
            "comment": anonymize_name,
        },
    }.items()
    for column, replacement in columns.items()
}
|
|
|
|
|
2020-09-26 16:19:37 +02:00
|
|
|
|
|
|
|
def anonymize_column(cursor, tablecolumn):
    """Overwrite every value of one column with its anonymized replacement.

    cursor: database cursor on which the UPDATE is executed.
    tablecolumn: string of the form "table_name.column_name", for example
        "identite.nom". It must be a key of ANONYMIZED_FIELDS; the associated
        value is the SQL expression assigned to the column for all rows.

    Raises ValueError if tablecolumn does not contain exactly one dot, and
    KeyError if it is not configured in ANONYMIZED_FIELDS.
    """
    table_name, column_name = tablecolumn.split(".")
    sql_expression = ANONYMIZED_FIELDS[tablecolumn]
    log(f"processing {tablecolumn}")
    # Identifiers and expression come from the hard-coded configuration
    # above, not from user input, hence the plain string interpolation.
    statement = f"UPDATE {table_name} SET {column_name} = {sql_expression};"
    cursor.execute(statement)
|
|
|
|
|
|
|
|
|
2024-03-01 11:12:36 +01:00
|
|
|
def rename_students(cursor):
    """Give every student a fictitious last name and first name.

    Each row of the "identite" table gets a name drawn at random from NOMS
    and a first name drawn at random from PRENOMS.

    cursor: database cursor whose rows can be indexed by column name
        (the script uses a psycopg2 DictCursor).
    """
    # Only the primary key is needed: do not load the whole table in memory.
    cursor.execute("""SELECT id FROM "identite";""")
    # Fetch everything first, since the same cursor is reused for the updates.
    etud_ids = [row["id"] for row in cursor.fetchall()]
    for etud_id in etud_ids:
        nom, prenom = random.choice(NOMS), random.choice(PRENOMS)
        cursor.execute(
            """UPDATE "identite"
            SET nom=%(nom)s, prenom=%(prenom)s
            WHERE id=%(id)s
            """,
            {
                "id": etud_id,
                "nom": nom,
                "prenom": prenom,
            },
        )
|
|
|
|
|
|
|
|
|
2023-02-09 12:29:46 +01:00
|
|
|
def anonymize_users(cursor):
    """Anonymize the "user" table and the references to user names.

    For every account: e-mail, password hashes, tokens and creation /
    expiration dates are reset to fixed values. Every account except
    'admin' then gets a random last name / first name, a matching e-mail
    address and a new unique login; the old login is replaced by the new
    one in the tables that store user names as plain references.

    cursor: database cursor whose rows can be indexed by column name
        (the script uses a psycopg2 DictCursor).
    """
    log("processing user table")
    # Reset all credential-like columns in a single pass over the table.
    cursor.execute(
        """UPDATE "user" SET
            email = 'x@y.fr',
            password_hash = '*',
            password_scodoc7 = NULL,
            date_created = '2001-01-01',
            date_expiration = '2201-12-31',
            token = NULL,
            token_expiration = NULL;"""
    )
    # Change last names / first names / e-mails
    cursor.execute("""SELECT id, user_name FROM "user" WHERE user_name <> 'admin';""")
    users = cursor.fetchall()  # fetch all: the table is modified in the loop
    nb_users = len(users)
    used_user_names = {u["user_name"] for u in users}
    # 'admin' is not in the selection above but its login is still taken.
    used_user_names.add("admin")
    # Tables (and columns) that reference users by their login.
    referencing_columns = (
        ("etud_annotations", "author"),
        ("scolog", "authenticated_user"),
        ("scolar_news", "authenticated_user"),
        ("notes_appreciations", "author"),
        ("are_historique", "authenticated_user"),
    )
    for i, user in enumerate(users, start=1):
        user_name = user["user_name"]
        nom, prenom = random.choice(NOMS), random.choice(PRENOMS)
        # prenom[:1] (not prenom[0]) so that an empty first name cannot crash.
        new_name = (prenom[:1] + nom).lower() or "user"
        # Ensure uniqueness among both original and already generated logins.
        while new_name in used_user_names:
            new_name += "x"
        used_user_names.add(new_name)
        print(f"{i}/{nb_users}\t{user_name} > {new_name}")
        cursor.execute(
            """UPDATE "user"
            SET nom=%(nom)s, prenom=%(prenom)s, email=%(email)s, user_name=%(new_name)s
            WHERE id=%(id)s
            """,
            {
                "email": f"{prenom}.{nom}@ano.nyme",
                "id": user["id"],
                "nom": nom,
                "prenom": prenom,
                "new_name": new_name,
            },
        )
        # Propagate the new login to the tables that reference it.
        for table, field in referencing_columns:
            cursor.execute(
                f"""UPDATE "{table}"
                SET {field}=%(new_name)s
                WHERE {field}=%(user_name)s
                """,
                {
                    "new_name": new_name,
                    "user_name": user_name,
                },
            )
|
2021-07-09 18:49:16 +02:00
|
|
|
|
2024-06-03 16:38:39 +02:00
|
|
|
def uri_rm_passwd(uri):
    """Return *uri* with the password of a PostgreSQL connection URI masked.

    Both URI schemes accepted by libpq, ``postgres://`` and
    ``postgresql://``, are handled, so that the password never appears in
    log output. The password part of ``scheme://user:password@...`` is
    replaced by ``*****``; a string that does not match this shape is
    returned unchanged.
    """
    return re.sub(r"(postgres(?:ql)?://[^:]+:)([^@]+)(@)", r"\1*****\3", uri)
|
2020-09-26 16:19:37 +02:00
|
|
|
|
|
|
|
def anonymize_db(cursor):
    """Anonymize, one after the other, all columns listed in ANONYMIZED_FIELDS.

    cursor: database cursor passed on to anonymize_column for each entry.
    """
    for field_key in ANONYMIZED_FIELDS:
        anonymize_column(cursor, field_key)
|
|
|
|
|
2024-03-01 11:12:36 +01:00
|
|
|
if __name__ == "__main__":
    # Accepted command lines:
    #   <script> dburi            anonymize student data only
    #   <script> --users dburi    also anonymize the user accounts
    PROCESS_USERS = False
    if len(sys.argv) < 2 or len(sys.argv) > 3:
        usage()
    if len(sys.argv) > 2:
        if sys.argv[1] != "--users":
            usage()
        dburi = sys.argv[2]
        PROCESS_USERS = True
    else:
        dburi = sys.argv[1]

    # The database name is only used in messages: the connection itself is
    # made with the full URI.
    dbname = urllib.parse.urlparse(dburi).path.lstrip("/")

    log(f"\nAnonymizing database {dbname}")
    try:
        cnx = psycopg2.connect(dburi)
    except Exception:
        log(f"\n*** Error: can't connect to database {dbname} ***\n")
        # Never log the URI with its password in clear.
        log(f"""connexion uri was "{uri_rm_passwd(dburi)}" """)
        traceback.print_exc()
        # Without a connection nothing below can run: stop here instead of
        # failing later with a NameError on cnx.
        sys.exit(1)

    try:
        # Single transaction: either everything is anonymized or nothing is.
        cnx.set_session(autocommit=False)
        cursor = cnx.cursor(cursor_factory=psycopg2.extras.DictCursor)

        anonymize_db(cursor)
        rename_students(cursor)
        if PROCESS_USERS:
            anonymize_users(cursor)

        cnx.commit()
    finally:
        # Closing without commit discards the pending transaction, so a
        # failure in any step above leaves the database untouched.
        cnx.close()
|