"""KeywordURNVerif: list the keywords of the anthologiagraeca.org API without URN.

Usage:

* ``get_no_urn_keywords()`` returns every keyword that has no URN.
* ``filter_by_category(get_no_urn_keywords(), "<category>")`` restricts that
  result to a single category (French category name).

Both functions export their result to CSV by default; pass
``csv_export=False`` to disable the export.
"""
# Provenance: code cells of jupyter/KeywordURNVerif.ipynb, added by commit
# e40bf56fe9553eeea43fc4eff77fc8ef13fd3b2b (William Bouchard, 2025-03-30).

import pandas as pd

API_URL = "https://anthologiagraeca.org/api/keywords/"

# Languages tried, in order, when choosing the display name of a keyword.
LANGUAGE_PRIORITY = ("fra", "eng", "grc", "ita", "lat")

# Seconds to wait for the API before giving up on a request.
REQUEST_TIMEOUT = 30

# Columns kept in the simplified keyword table.
_COLUMNS_TO_KEEP = ['id', 'url', 'category_name', 'name', 'alternative_urn']


def get_api_data(api_url=API_URL, timeout=REQUEST_TIMEOUT) -> pd.DataFrame:
    """Download every keyword from the paginated API into a DataFrame.

    Args:
        api_url: URL of the first page; following pages are read from the
            ``next`` field of each response.
        timeout: Seconds to wait for each HTTP request.

    Returns:
        One row per keyword, or an empty DataFrame if any page answers with
        a status code other than 200 (the status is printed).

    Raises:
        requests.RequestException: On network failures, including timeouts.
    """
    # Imported here so the pure helpers of this module stay importable (and
    # testable) on machines without the HTTP dependency.
    import requests

    records = []
    url = api_url
    # One session for all pages so the connection can be reused.
    with requests.Session() as session:
        while url:
            # Without a timeout a stalled server would block this loop forever.
            response = session.get(url, timeout=timeout)
            if response.status_code != 200:
                print(f"Erreur API : {response.status_code}")
                return pd.DataFrame()  # Empty DataFrame signals the failure
            page = response.json()
            records.extend(page['results'])
            url = page.get('next')
    return pd.DataFrame(records)


def get_name(names, priority_order=LANGUAGE_PRIORITY):
    """Pick one name from a list of names according to language priority.

    Args:
        names: List of dicts carrying a ``"language"`` and a ``"name"`` key.
        priority_order: Language codes from most to least preferred
            (French first, then English, etc. by default).

    Returns:
        The name in the most preferred language available, or None when no
        name uses one of the priority languages (or ``names`` is not a list).
    """
    if not isinstance(names, (list, tuple)):
        return None

    # Remember the first name seen for each language ...
    first_by_language = {}
    for entry in names:
        first_by_language.setdefault(entry.get("language"), entry.get("name"))

    # ... then let the priority order (not the order of `names`) decide.
    for language in priority_order:
        if language in first_by_language:
            return first_by_language[language]
    return None


def get_category_name(category):
    """Return the French name of a category.

    Args:
        category: Dict with a ``"names"`` list whose items carry a
            ``"language"`` and a ``"name"`` key.

    Returns:
        The name whose language is ``"fra"``, or None if there is none or if
        ``category`` is not a dict (e.g. a missing value).
    """
    if not isinstance(category, dict):
        return None
    return next(
        (
            entry["name"]
            for entry in category.get("names", [])
            if entry["language"] == "fra"
        ),
        None,
    )


def get_urn(alternative_urns):
    """Extract the URN of a keyword.

    Only the first entry of ``alternative_urns`` is inspected.

    Args:
        alternative_urns: List of dicts that may carry a ``"urn"`` key.

    Returns:
        The ``"urn"`` value of the first entry, or None if the list is empty,
        is not a list, or its first entry has no ``"urn"`` key.
    """
    if not isinstance(alternative_urns, (list, tuple)) or not alternative_urns:
        return None
    return alternative_urns[0].get("urn")


def get_no_urn_keywords(csv_export=True) -> pd.DataFrame:
    """Return the keywords that have no URN.

    Args:
        csv_export: When true, also write the result to
            ``keywords_no_urn.csv``.

    Returns:
        A DataFrame with the columns ``id``, ``url``, ``category_name``,
        ``name`` and ``alternative_urn``. It is empty, and nothing is
        exported, when the API returned no data.
    """
    keywords = get_api_data()
    if keywords.empty:
        # get_api_data() returns a column-less frame on API errors; indexing
        # it below would raise KeyError, so hand back an empty result instead.
        return pd.DataFrame(columns=_COLUMNS_TO_KEEP)

    # Flatten the nested API structures into plain columns.
    keywords["category_name"] = keywords["category"].apply(get_category_name)
    keywords["name"] = keywords["names"].apply(get_name)
    keywords["alternative_urn"] = keywords["alternative_urns"].apply(get_urn)

    # Keep only the rows without URN, and only the columns of interest.
    has_no_urn = keywords["alternative_urn"].isna()
    keywords_no_urn = keywords.loc[has_no_urn, _COLUMNS_TO_KEEP]

    if csv_export:
        keywords_no_urn.to_csv("keywords_no_urn.csv", index=False, encoding="utf-8")

    return keywords_no_urn


def filter_by_category(keywords, category, csv_export=True) -> pd.DataFrame:
    """Return the keywords belonging to the requested category.

    Args:
        keywords: DataFrame with a ``category_name`` column, such as the one
            returned by ``get_no_urn_keywords``.
        category: Category name to keep (exact match).
        csv_export: When true, also write the result to
            ``<category>_no_urn.csv``.

    Returns:
        The rows of ``keywords`` whose ``category_name`` equals ``category``.
    """
    filtered_keywords = keywords[keywords["category_name"] == category]

    if csv_export:
        filtered_keywords.to_csv(
            f"{category}_no_urn.csv", index=False, encoding="utf-8"
        )

    return filtered_keywords


if __name__ == "__main__":
    # Example usage: fetch once, then reuse the result for the category filter.
    no_urn_keywords = get_no_urn_keywords()
    print(no_urn_keywords)
    print(filter_by_category(no_urn_keywords, "Poètes cités"))