diff --git a/WWW21-Information_Extraction_from_Co-Occurring_Similar_Entities.ipynb b/WWW21-Information_Extraction_from_Co-Occurring_Similar_Entities.ipynb index 49ce31f..ccb6edd 100644 --- a/WWW21-Information_Extraction_from_Co-Occurring_Similar_Entities.ipynb +++ b/WWW21-Information_Extraction_from_Co-Occurring_Similar_Entities.ipynb @@ -4,15 +4,2121 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Code and results of our TheWebConf 2021 paper `Information Extraction from Co-Occurring Similar Entities` will be published here very soon." + "# Information Extraction from Co-Occurring Similar Entities\n", + "This notebook contains the necessary code for reproducing the results of the paper \"Information Extraction from Co-Occurring Similar Entities\". The functionality for extracting new entities and facts for CaLiGraph is already implemented in the extraction framework. Here, we simply run this functionality to show the results and extend some parts to extract results for DBpedia as well.\n", + "\n", + "To reproduce the results, parts of the CaLiGraph extraction framework need to be run first. If necessary, the extraction will be triggered automatically from this notebook. A full run of this notebook needs roughly two days (plus three additional days when running the extraction framework). 
For hardware requirements, refer to the README.\n", + "\n", + "Evaluation results of the paper and all triples produced in this notebook can be downloaded [here](http://data.dws.informatik.uni-mannheim.de/CaLiGraph/www2021/).\n", + "\n", + "*Note: The results produced here differ slightly from the results of the paper as we use new Wikipedia and DBpedia dumps from 2020 instead of from 2016.*" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Preparations" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2021-03-03 16:50:32,375 DEBUG: RDFLib Version: 5.0.0\n" + ] + } + ], + "source": [ + "# imports\n", + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "pd.options.display.max_rows = 50\n", + "pd.options.display.max_columns = 50\n", + "pd.options.display.max_colwidth = 100\n", + "\n", + "import impl.dbpedia.store as dbp_store\n", + "import impl.dbpedia.util as dbp_util\n", + "import impl.util.rdf as rdf_util\n", + "import impl.util.serialize as serialize_util\n", + "from impl import caligraph\n", + "import impl.caligraph.util as clg_util\n", + "from impl.listing import context, extract" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2021-03-03 20:15:32,126 INFO: Note: NumExpr detected 32 cores but \"NUMEXPR_MAX_THREADS\" not set, so enforcing safe limit of 8.\n", + "2021-03-03 20:15:32,130 INFO: NumExpr defaulting to 8 threads.\n" + ] + } + ], + "source": [ + "# load data\n", + "graph = caligraph.get_axiom_graph()\n", + "df = context.retrieve_page_entity_context(graph)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Type Extraction for *DBpedia*" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + 
"outputs": [], + "source": [ + "# compute valid combinations of types and NE tags\n", + "\n", + "def _get_transitive_types_dbp(entity_name: str) -> set:\n", + " dbp_uri = dbp_util.name2resource(str(entity_name))\n", + " transitive_types = dbp_store.get_transitive_types(dbp_uri).difference({rdf_util.CLASS_OWL_THING})\n", + " return {dbp_util.type2name(t) for t in transitive_types}\n", + "\n", + "def _get_valid_tags_for_entity_types_dbp(dft: pd.DataFrame, threshold: float) -> dict:\n", + " tag_probabilities = context._get_tag_probabilities(dft)\n", + " valid_tags = tag_probabilities[tag_probabilities['tag_fit'] >= threshold].groupby('E_enttype')['E_tag'].apply(lambda x: x.values.tolist()).to_dict()\n", + " for ent_type in set(valid_tags): # assign tags of parents to types without tags (to avoid inconsistencies)\n", + " valid_tags[ent_type] = _compute_valid_tags_for_type_dbp(ent_type, valid_tags)\n", + " return valid_tags\n", + "\n", + "def _compute_valid_tags_for_type_dbp(ent_type: str, valid_tags: dict) -> set:\n", + " if ent_type not in valid_tags:\n", + " return set()\n", + " if not valid_tags[ent_type]:\n", + " valid_tags[ent_type] = {tag for ptype in _get_supertypes_dbp(ent_type) for tag in _compute_valid_tags_for_type_dbp(ptype, valid_tags)}\n", + " return valid_tags[ent_type]\n", + "\n", + "def _get_supertypes_dbp(type_name: str) -> set:\n", + " dbp_type = dbp_util.name2type(str(type_name))\n", + " return {dbp_util.type2name(t) for t in dbp_store.get_supertypes(dbp_type).difference({rdf_util.CLASS_OWL_THING})}\n", + "\n", + "df_types_dbp = pd.DataFrame([{'E_ent': ent, 'E_enttype': t} for ent in df['E_ent'].unique() for t in _get_transitive_types_dbp(ent)])\n", + "dft_dbp = pd.merge(left=df, right=df_types_dbp, on='E_ent')\n", + "valid_tags_dbp = _get_valid_tags_for_entity_types_dbp(dft_dbp, .35)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [], + "source": [ + "# extract rules\n", + "\n", + "rule_dfs_dbp = 
{}\n", + "for rule_name, rule_pattern in extract.RULE_PATTERNS.items():\n", + " dft_by_page = extract._aggregate_types_by_page(dft_dbp, rule_pattern)\n", + " rule_dfs_dbp[rule_name] = extract._aggregate_types_by_section(dft_by_page, rule_pattern)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Threshold Selection" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [], + "source": [ + "# extract all possible new types up to a very low threshold (mean >= 0.25 and std <= 0.75)\n", + "\n", + "threshold_rule_dfs_dbp = [rule_dfs_dbp[rule_name].query(f'micro_mean > .25 & micro_std < .75').reset_index()[rule_pattern + ['E_enttype', 'micro_mean', 'micro_std']].drop_duplicates() for rule_name, rule_pattern in extract.RULE_PATTERNS.items()]\n", + "threshold_types_dbp = extract._extract_new_types(threshold_rule_dfs_dbp, df, df_types_dbp)" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "def visualise_type_thresholds(df, tag_probabilities, bin_count):\n", + " \"\"\"Visualize the tag fit in relation to the confidence and consistency (here: std).\"\"\"\n", + " df = pd.merge(df, tag_probabilities, on=['E_enttype', 'E_tag'])\n", + " \n", + " min_mean = .25\n", + " mean_conf_labels = [min_mean + x / bin_count * (1 - min_mean) for x in range(bin_count)]\n", + " df['mean_confidence'] = pd.cut(df['micro_mean'], bin_count, labels=mean_conf_labels)\n", + " \n", + " # per bin\n", + " plt.bar(x=mean_conf_labels, height=df.groupby('mean_confidence')['tag_fit'].mean().values, width=(1-min_mean)/bin_count*0.8, align='edge', label='tag_fit')\n", + " # cumulative\n", + " cumulative_tag_fit = [df[df['micro_mean'] > x]['tag_fit'].mean() for x in mean_conf_labels]\n", + " plt.plot(mean_conf_labels, cumulative_tag_fit, 'ro--', label='tag_fit (cum)')\n", + " plt.xlabel('Pattern Confidence')\n", + " plt.title('Pattern Confidence vs. Tagger Confidence')\n", + " plt.legend()\n", + " plt.show()\n", + "\n", + " max_std = .75\n", + " std_conf_labels = [(x+1) / bin_count * max_std for x in range(bin_count)]\n", + " df['std_confidence'] = pd.cut(df['micro_std'], bin_count, labels=std_conf_labels)\n", + "\n", + " # per bin\n", + " plt.bar(x=std_conf_labels, height=df.groupby('std_confidence')['tag_fit'].mean().values, width=max_std/bin_count*-0.8, align='edge', label='tag_fit')\n", + " # cumulative\n", + " cumulative_tag_fit = [df[df['micro_std'] < x]['tag_fit'].mean() for x in std_conf_labels]\n", + " plt.plot(std_conf_labels, cumulative_tag_fit, 'ro--', label='tag_fit (cum)')\n", + " plt.xlabel('Pattern Std.Dev.')\n", + " plt.title('Pattern Std.Dev. vs. 
Tagger Confidence')\n", + " plt.legend()\n", + " plt.show()\n", + " \n", + "visualise_type_thresholds(threshold_types_dbp, context._get_tag_probabilities(dft_dbp), 15)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Results" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [], + "source": [ + "# filter rules by threshold and compute filtered types\n", + "\n", + "new_types_dbp = extract._extract_new_types_with_threshold(df, df_types_dbp, rule_dfs_dbp)\n", + "filtered_types_dbp = extract._filter_new_types_by_tag(new_types_dbp, valid_tags_dbp)" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
P_basetypeTS_textE_enttypemicro_meanmicro_stdPE_entE_textE_tagTS_entS_textS_entE_idTS_enttypeS_enttypeP_type
29480310WorkPersonnelPerson0.9731510.045563Vanished GardensVanished Gardens--Bernie GrundmanBernie GrundmanPERSONNonePersonnelNone39380735.0NaNNaNAlbum
18362792OtherPeoplePerson0.9902740.019795MenasceDaniel MenasceDaniel MenascePERSONNonePeopleNone38921168.0NaNNaNOther
20987372OtherSpeciesEukaryote0.9913290.015792CoelotesCoelotes striatilamnisCoelotes s. ketmenensisOTHERNoneSpeciesNone33212227.0NaNNaNOther
31684226OrganisationFirst World WarOrganisation1.0000000.0000006th (United Kingdom) Division6th (United Kingdom) Division--9th (Service)9th (Service)ORGWorld War ILater in the WarNone4250374.0MilitaryConflictNaNMilitaryUnit
29948623WorkTrack listingMusicalWork0.9431740.103822A Picture of Me (Without You)A Picture of Me (Without You)--She Loves Me (Right Out of My Mind)She Loves Me (Right Out of My Mind)WORK_OF_ARTNoneTrack listingNone16758878.0NaNNaNAlbum
9052004LocationStoresCompany0.9834710.023047Eastport PlazaEastport Plaza--Nail TekNail TekORGNoneStoresNone30609099.0NaNNaNShoppingMall
28487131WorkPersonnelArtist0.8788500.140657SimetriSitiSimetriSiti--Jay FrancoJay FrancoPERSONNonePersonnelNone37829447.0NaNNaNAlbum
34360607OtherRolling stockMeanOfTransportation1.0000000.000000KamomeKamome--485 series EMUs485 series EMUsPRODUCTNoneRolling stock used in the pastNone16899907.0NaNNaNOther
21119704OtherSpeciesSpecies0.9996080.002107GageaGagea hiensisGagea hiensisOTHERNoneSpeciesNone7517175.0NaNNaNOther
2532341ListChampions by yearAthlete0.9818840.112793List of Grand Slam men's singles championsAshley Cooper (tennis player)Ashley CooperPERSONNoneChampions by yearNone8457320.0NaNNaNList
28815210WorkPersonnelMusicalArtist0.8717310.145537Loud 'n' RawLoud 'n' Raw--Hirotsugu HommaHirotsugu HommaPERSONNonePersonnelNone25800546.0NaNNaNAlbum
30777644WorkTrack listingSong0.8887730.173072Strange BluesStrange BluesStrange BluesWORK_OF_ARTNoneTrack listingNone32289269.0NaNNaNAlbum
3295536ListForza ItaliaPerson1.0000000.000000List of members of the Italian Senate, 1996–2001Dino De AnnaDino De AnnaPERSONForza ItaliaForza ItaliaForza Italia33607227.0PoliticalPartyPoliticalPartyList
16237881OtherLine of SuccessionAgent1.0000000.000000Earl of LonsdaleEarl of Lonsdale--Nikolas LowtherNikolas LowtherPERSONNoneLine of SuccessionNone3832509.0NaNNaNOther
20650923OtherSpeciesAnimal0.9342130.151303NeonoemacheilusNeonoemacheilus morehensisNeonoemacheilus morehensisSPECIESNoneSpeciesNone29062922.0NaNNaNOther
9728401OrganisationAffiliated artistsPerson0.8571430.000000301Studios301Studios--Max HabenMax HabenPERSONNoneAffiliated artistsNone20434515.0NaNNaNRecordLabel
28364375WorkPersonnelArtist0.8788500.140657BatbabyBatbaby--Amy LutherAmy LutherPERSONNonePersonnelNone29141603.0NaNNaNAlbum
9212341LocationTowns and villages in the districtPopulatedPlace1.0000000.000000Kottayam districtKottayam district--MudiyoorkaraMudiyoorkaraGPENoneOther villagesNone690614.0NaNNaNAdministrativeRegion
23163889PersonFeature filmsWork0.9954340.005732Peggy McCayPeggy McCay--Daddy's GirlDaddy's GirlWORK_OF_ARTNoneFeature filmsNone7372097.0NaNNaNPerson
31266144WorkTrack listingWork0.9493920.093618Regresará Por MíRegresará Por Mí--Tu MisericordiaTu MisericordiaWORK_OF_ARTNoneTrack listingNone27737421.0NaNNaNAlbum
\n", + "
" + ], + "text/plain": [ + " P_basetype TS_text \\\n", + "29480310 Work Personnel \n", + "18362792 Other People \n", + "20987372 Other Species \n", + "31684226 Organisation First World War \n", + "29948623 Work Track listing \n", + "9052004 Location Stores \n", + "28487131 Work Personnel \n", + "34360607 Other Rolling stock \n", + "21119704 Other Species \n", + "2532341 List Champions by year \n", + "28815210 Work Personnel \n", + "30777644 Work Track listing \n", + "3295536 List Forza Italia \n", + "16237881 Other Line of Succession \n", + "20650923 Other Species \n", + "9728401 Organisation Affiliated artists \n", + "28364375 Work Personnel \n", + "9212341 Location Towns and villages in the district \n", + "23163889 Person Feature films \n", + "31266144 Work Track listing \n", + "\n", + " E_enttype micro_mean micro_std \\\n", + "29480310 Person 0.973151 0.045563 \n", + "18362792 Person 0.990274 0.019795 \n", + "20987372 Eukaryote 0.991329 0.015792 \n", + "31684226 Organisation 1.000000 0.000000 \n", + "29948623 MusicalWork 0.943174 0.103822 \n", + "9052004 Company 0.983471 0.023047 \n", + "28487131 Artist 0.878850 0.140657 \n", + "34360607 MeanOfTransportation 1.000000 0.000000 \n", + "21119704 Species 0.999608 0.002107 \n", + "2532341 Athlete 0.981884 0.112793 \n", + "28815210 MusicalArtist 0.871731 0.145537 \n", + "30777644 Song 0.888773 0.173072 \n", + "3295536 Person 1.000000 0.000000 \n", + "16237881 Agent 1.000000 0.000000 \n", + "20650923 Animal 0.934213 0.151303 \n", + "9728401 Person 0.857143 0.000000 \n", + "28364375 Artist 0.878850 0.140657 \n", + "9212341 PopulatedPlace 1.000000 0.000000 \n", + "23163889 Work 0.995434 0.005732 \n", + "31266144 Work 0.949392 0.093618 \n", + "\n", + " P \\\n", + "29480310 Vanished Gardens \n", + "18362792 Menasce \n", + "20987372 Coelotes \n", + "31684226 6th (United Kingdom) Division \n", + "29948623 A Picture of Me (Without You) \n", + "9052004 Eastport Plaza \n", + "28487131 SimetriSiti \n", + "34360607 Kamome \n", + 
"21119704 Gagea \n", + "2532341 List of Grand Slam men's singles champions \n", + "28815210 Loud 'n' Raw \n", + "30777644 Strange Blues \n", + "3295536 List of members of the Italian Senate, 1996–2001 \n", + "16237881 Earl of Lonsdale \n", + "20650923 Neonoemacheilus \n", + "9728401 301Studios \n", + "28364375 Batbaby \n", + "9212341 Kottayam district \n", + "23163889 Peggy McCay \n", + "31266144 Regresará Por Mí \n", + "\n", + " E_ent \\\n", + "29480310 Vanished Gardens--Bernie Grundman \n", + "18362792 Daniel Menasce \n", + "20987372 Coelotes striatilamnis \n", + "31684226 6th (United Kingdom) Division--9th (Service) \n", + "29948623 A Picture of Me (Without You)--She Loves Me (Right Out of My Mind) \n", + "9052004 Eastport Plaza--Nail Tek \n", + "28487131 SimetriSiti--Jay Franco \n", + "34360607 Kamome--485 series EMUs \n", + "21119704 Gagea hiensis \n", + "2532341 Ashley Cooper (tennis player) \n", + "28815210 Loud 'n' Raw--Hirotsugu Homma \n", + "30777644 Strange Blues \n", + "3295536 Dino De Anna \n", + "16237881 Earl of Lonsdale--Nikolas Lowther \n", + "20650923 Neonoemacheilus morehensis \n", + "9728401 301Studios--Max Haben \n", + "28364375 Batbaby--Amy Luther \n", + "9212341 Kottayam district--Mudiyoorkara \n", + "23163889 Peggy McCay--Daddy's Girl \n", + "31266144 Regresará Por Mí--Tu Misericordia \n", + "\n", + " E_text E_tag TS_ent \\\n", + "29480310 Bernie Grundman PERSON None \n", + "18362792 Daniel Menasce PERSON None \n", + "20987372 Coelotes s. 
ketmenensis OTHER None \n", + "31684226 9th (Service) ORG World War I \n", + "29948623 She Loves Me (Right Out of My Mind) WORK_OF_ART None \n", + "9052004 Nail Tek ORG None \n", + "28487131 Jay Franco PERSON None \n", + "34360607 485 series EMUs PRODUCT None \n", + "21119704 Gagea hiensis OTHER None \n", + "2532341 Ashley Cooper PERSON None \n", + "28815210 Hirotsugu Homma PERSON None \n", + "30777644 Strange Blues WORK_OF_ART None \n", + "3295536 Dino De Anna PERSON Forza Italia \n", + "16237881 Nikolas Lowther PERSON None \n", + "20650923 Neonoemacheilus morehensis SPECIES None \n", + "9728401 Max Haben PERSON None \n", + "28364375 Amy Luther PERSON None \n", + "9212341 Mudiyoorkara GPE None \n", + "23163889 Daddy's Girl WORK_OF_ART None \n", + "31266144 Tu Misericordia WORK_OF_ART None \n", + "\n", + " S_text S_ent E_id \\\n", + "29480310 Personnel None 39380735.0 \n", + "18362792 People None 38921168.0 \n", + "20987372 Species None 33212227.0 \n", + "31684226 Later in the War None 4250374.0 \n", + "29948623 Track listing None 16758878.0 \n", + "9052004 Stores None 30609099.0 \n", + "28487131 Personnel None 37829447.0 \n", + "34360607 Rolling stock used in the past None 16899907.0 \n", + "21119704 Species None 7517175.0 \n", + "2532341 Champions by year None 8457320.0 \n", + "28815210 Personnel None 25800546.0 \n", + "30777644 Track listing None 32289269.0 \n", + "3295536 Forza Italia Forza Italia 33607227.0 \n", + "16237881 Line of Succession None 3832509.0 \n", + "20650923 Species None 29062922.0 \n", + "9728401 Affiliated artists None 20434515.0 \n", + "28364375 Personnel None 29141603.0 \n", + "9212341 Other villages None 690614.0 \n", + "23163889 Feature films None 7372097.0 \n", + "31266144 Track listing None 27737421.0 \n", + "\n", + " TS_enttype S_enttype P_type \n", + "29480310 NaN NaN Album \n", + "18362792 NaN NaN Other \n", + "20987372 NaN NaN Other \n", + "31684226 MilitaryConflict NaN MilitaryUnit \n", + "29948623 NaN NaN Album \n", + "9052004 NaN 
NaN ShoppingMall \n", + "28487131 NaN NaN Album \n", + "34360607 NaN NaN Other \n", + "21119704 NaN NaN Other \n", + "2532341 NaN NaN List \n", + "28815210 NaN NaN Album \n", + "30777644 NaN NaN Album \n", + "3295536 PoliticalParty PoliticalParty List \n", + "16237881 NaN NaN Other \n", + "20650923 NaN NaN Other \n", + "9728401 NaN NaN RecordLabel \n", + "28364375 NaN NaN Album \n", + "9212341 NaN NaN AdministrativeRegion \n", + "23163889 NaN NaN Person \n", + "31266144 NaN NaN Album " + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "filtered_types_dbp.sample(20)" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
E_textE_enttype
14654325Nicolae NegumereanuMartialArtist
29470332ErgunPerson
14416762Nilkanth Singh MundaPolitician
21230683Medetera valaisensisSpecies
24549197Daïnah la métisseWork
29966481Steady RainMusicalWork
15827202Harry WeisbergAgent
30065800Slide ShowMusicalWork
26804418Amelia WiśniewskaAgent
29313304Joshua Levy-Person
32285183Maeght FoundationArchitecturalStructure
212625Mars MatrixVideoGame
28654942Chris LawsMusicalArtist
35334405The TigressFilm
29280036A. J. BensonPerson
3270801Art DeCarloGridironFootballPlayer
3806127KENH-LDBroadcaster
20592007Euclystis golosusAnimal
909124Alina GrachevaPerson
14001821LeadbetterSnookerPlayer
\n", + "
" + ], + "text/plain": [ + " E_text E_enttype\n", + "14654325 Nicolae Negumereanu MartialArtist\n", + "29470332 Ergun Person\n", + "14416762 Nilkanth Singh Munda Politician\n", + "21230683 Medetera valaisensis Species\n", + "24549197 Daïnah la métisse Work\n", + "29966481 Steady Rain MusicalWork\n", + "15827202 Harry Weisberg Agent\n", + "30065800 Slide Show MusicalWork\n", + "26804418 Amelia Wiśniewska Agent\n", + "29313304 Joshua Levy- Person\n", + "32285183 Maeght Foundation ArchitecturalStructure\n", + "212625 Mars Matrix VideoGame\n", + "28654942 Chris Laws MusicalArtist\n", + "35334405 The Tigress Film\n", + "29280036 A. J. Benson Person\n", + "3270801 Art DeCarlo GridironFootballPlayer\n", + "3806127 KENH-LD Broadcaster\n", + "20592007 Euclystis golosus Animal\n", + "909124 Alina Gracheva Person\n", + "14001821 Leadbetter SnookerPlayer" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "filtered_types_dbp.sample(20)[['E_text', 'E_enttype']]" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [], + "source": [ + "# write new types to file\n", + "def serialize_types_dbp(df, filename):\n", + " triples = [serialize_util.as_object_triple(dbp_util.name2resource(r['E_ent']), rdf_util.PREDICATE_TYPE, dbp_util.name2type(r['E_enttype'])) for _, r in df.iterrows()]\n", + " with open(filename, mode='w') as f:\n", + " f.writelines(triples)\n", + "\n", + "serialize_types_dbp(filtered_types_dbp, 'new_types_dbpedia.nt')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Type Extraction for *CaLiGraph*" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "# compute valid combinations of types and NE tags\n", + "\n", + "df_types_clg = context.get_entity_types(df, graph)\n", + "dft_clg = pd.merge(left=df, right=df_types_clg, on='E_ent')\n", + "valid_tags_clg = 
context.get_valid_tags_for_entity_types(dft_clg, graph, .35)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "# extract rules\n", + "\n", + "rule_dfs_clg = {}\n", + "for rule_name, rule_pattern in extract.RULE_PATTERNS.items():\n", + " dft_by_page = extract._aggregate_types_by_page(dft_clg, rule_pattern)\n", + " rule_dfs_clg[rule_name] = extract._aggregate_types_by_section(dft_by_page, rule_pattern)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Results" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "# filter rules by threshold and compute filtered types\n", + "\n", + "new_types_clg = extract._extract_new_types_with_threshold(df, df_types_clg, rule_dfs_clg)\n", + "filtered_types_clg = extract._filter_new_types_by_tag(new_types_clg, valid_tags_clg)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# write new types to file\n", + "\n", + "def serialize_types_clg(df, filename):\n", + " triples = [serialize_util.as_object_triple(clg_util.name2clg_resource(r['E_ent']), rdf_util.PREDICATE_TYPE, clg_util.name2clg_type(r['E_enttype'])) for _, r in df.iterrows()]\n", + " with open(filename, mode='w') as f:\n", + " f.writelines(triples)\n", + "\n", + "serialize_types_clg(filtered_types_clg, 'new_types_caligraph.nt')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Relation Extraction" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "# retrieve existing relations\n", + "\n", + "df_rels = context.get_entity_relations()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Threshold Selection" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "# extract all possible rules for 
relations with PageEntity as target\n", + "\n", + "rule_dfs = {}\n", + "dfr = extract._create_relation_df(df, df_rels, 'P')\n", + "dfr_types = extract._create_relation_type_df(dfr)\n", + "for rule_name, rule_pattern in extract.RULE_PATTERNS.items():\n", + " dfr_by_page = extract._aggregate_relations_by_page(df, dfr, df_rels, rule_pattern)\n", + " rule_dfs[rule_name] = extract._aggregate_relations_by_section(dfr_by_page, rule_pattern)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "# generate all possible relations up to a very low threshold (mean > 0.25 and std < 0.75)\n", + "threshold_rule_dfs = [rule_dfs[rule_name].query(f'page_count > 2 & micro_mean > .25 & micro_std < .75').reset_index()[rule_pattern + ['rel', 'micro_mean', 'micro_std']].drop_duplicates() for rule_name, rule_pattern in extract.RULE_PATTERNS.items()]\n", + "threshold_relations = extract._extract_new_relations(threshold_rule_dfs, 'P', df, dfr_types, df_rels)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "# visualise thresholds\n", + "\n", + "def visualise_relation_thresholds(df, target, tag_probabilities, bin_count): \n", + " \"\"\"Visualize the tag fit in relation to the confidence and consistency (here: std).\"\"\"\n", + " # add tag_fit\n", + " df = pd.merge(df, tag_probabilities, left_on=['E_predtype', 'E_tag'], right_on=['E_enttype', 'E_tag'])\n", + " \n", + " min_mean = .25\n", + " mean_conf_labels = [min_mean + x / bin_count * (1 - min_mean) for x in range(bin_count)]\n", + " df['mean_confidence'] = pd.cut(df['micro_mean'], bin_count, labels=mean_conf_labels)\n", + " max_std = .75\n", + " std_conf_labels = [(x+1) / bin_count * max_std for x in range(bin_count)]\n", + " df['std_confidence'] = pd.cut(df['micro_std'], bin_count, labels=std_conf_labels)\n", + " \n", + " ## tag_fit with confidence ##\n", + " # per bin\n", + " plt.bar(x=mean_conf_labels, height=df.groupby('mean_confidence')['tag_fit'].mean().values, width=(1-min_mean)/bin_count*0.8, align='edge', label='tag_fit')\n", + " # cumulative\n", + " cumulative_tag_fit = [df[df['micro_mean'] > x]['tag_fit'].mean() for x in mean_conf_labels]\n", + " plt.plot(mean_conf_labels, cumulative_tag_fit, 'ro--', label='tag_fit (cum)')\n", + " plt.xlabel('Pattern Confidence')\n", + " plt.title('Pattern Confidence vs. Tagger Confidence')\n", + " plt.legend()\n", + " plt.show()\n", + " \n", + " ## tag_fit with consistency ##\n", + " # per bin\n", + " plt.bar(x=std_conf_labels, height=df.groupby('std_confidence')['tag_fit'].mean().values, width=max_std/bin_count*-0.8, align='edge', label='tag_fit')\n", + " # cumulative\n", + " cumulative_tag_fit = [df[df['micro_std'] < x]['tag_fit'].mean() for x in std_conf_labels]\n", + " plt.plot(std_conf_labels, cumulative_tag_fit, 'ro--', label='tag_fit (cum)')\n", + " plt.xlabel('Pattern Std.Dev.')\n", + " plt.title('Pattern Std.Dev. vs. 
Tagger Confidence')\n", + " plt.legend()\n", + " plt.show()\n", + " \n", + "visualise_relation_thresholds(threshold_relations, 'P', context._get_tag_probabilities(dft_dbp), 15)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Results" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "# retrieve new relations for all three targets\n", + "df_new_relations = extract._compute_new_relations(df, df_rels, 'P', valid_tags_dbp)\n", + "df_new_relations = pd.concat([df_new_relations, extract._compute_new_relations(df, df_rels, 'TS_ent', valid_tags_dbp)])\n", + "df_new_relations = pd.concat([df_new_relations, extract._compute_new_relations(df, df_rels, 'S_ent', valid_tags_dbp)])" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
P_basetypeTS_textrelmicro_meanmicro_stdPE_entE_textE_tagTS_entS_textS_entE_idTS_enttypeS_enttypeP_typepredinvE_predtypetarget
1804314OtherSpecies< genus <0.9770190.045419HalimuraenaHalimuraena shakaiHalimuraena shakaiSPECIESNoneSpeciesNone39648306NaNNaNOtherhttp://dbpedia.org/ontology/genusTrueSpeciesHalimuraena
216985LocationCommunities< subdivision <0.9573670.098188Christian County, MissouriChadwick, MissouriChadwickGPENoneOther unincorporated communitiesNone1353294NaNNaNAdministrativeRegionhttp://dbpedia.org/ontology/subdivisionTruePopulatedPlaceChristian County, Missouri
107055ListOdonata< order <1.0000000.000000List of least concern insectsPinheyschna subpupillataStream hawkerSPECIESOdonataAeshnidsAeshnidae36319656NaNNaNListhttp://dbpedia.org/ontology/orderTrueSpeciesOdonata
1650594OtherSpecies< genus <0.9770190.045419EupoeciliaEupoecilia kruegerianaEupoecilia kruegerianaSPECIESNoneSpeciesNone25698774NaNNaNOtherhttp://dbpedia.org/ontology/genusTrueSpeciesEupoecilia
61042ListGastropods< family <1.0000000.000000List of vulnerable molluscsBrotia wykoffiBrotia wykoffiSPECIESGastropodaPachychilidsPachychilidae36300492NaNNaNListhttp://dbpedia.org/ontology/familyTrueSpeciesPachychilidae
1732647OtherSpecies< genus <0.9770190.045419LeptonetaLeptoneta maculosaL. maculosaOTHERNoneSpeciesNone37360208NaNNaNOtherhttp://dbpedia.org/ontology/genusTrueSpeciesLeptoneta
2084439SpeciesSpecies< genus <0.9840590.029726Triphora (gastropod)Triphora dalliTriphora dalliSPECIESSpeciesSpeciesSpecies32364528NaNNaNMolluscahttp://dbpedia.org/ontology/genusTrueSpeciesTriphora (gastropod)
654105PersonDirecting filmography< director <1.0000000.000000Jonathan FrakesJonathan Frakes--All or NothingAll or NothingWORK_OF_ARTNoneTelevisionNone1103264NaNNaNPersonhttp://dbpedia.org/ontology/directorTrueFilmJonathan Frakes
806033PersonFilm scores< musicComposer <0.8732880.154829Don PrestonThe Underachievers (film)The UnderachieversWORK_OF_ARTNoneFilm scoresNone12602413NaNNaNMusicalArtisthttp://dbpedia.org/ontology/musicComposerTrueWorkDon Preston
611381OrganisationSingles< artist <0.9032260.130987La FactoríaLa Factoría--AmigaAmigaWORK_OF_ARTNoneSinglesNone21081955NaNNaNBandhttp://dbpedia.org/ontology/artistTrueMusicalWorkLa Factoría
1509909OtherSpecies< genus <0.9770190.045419Mouse-eared batMyotis oxyotusMyotis oxyotusOTHERNoneSpeciesNone8796526NaNNaNOtherhttp://dbpedia.org/ontology/genusTrueSpeciesMouse-eared bat
1226829SpeciesMain< genus <0.9851820.098228ApristusApristus striatusApristus striatusOTHERNoneMainNone28762634NaNNaNInsecthttp://dbpedia.org/ontology/genusTrueSpeciesApristus
385490LocationCommunities< subdivision <0.9808440.034299Arkansas County, ArkansasNady, ArkansasNadyGPENoneUnincorporated communitiesNone1240891NaNNaNAdministrativeRegionhttp://dbpedia.org/ontology/subdivisionTruePopulatedPlaceArkansas County, Arkansas
795111PersonWorks< author <0.9583950.199014Krishna SobtiKrishna Sobti--BadalonBadalonWORK_OF_ARTNoneNovelsNone20960058NaNNaNWriterhttp://dbpedia.org/ontology/authorTrueWorkKrishna Sobti
63260ListInsects< family <1.0000000.000000List of least concern arthropodsMicrathyria caerulistylaBlue-tipped dasherSPECIESInsectLibellulidsLibellulidae36130044NaNNaNListhttp://dbpedia.org/ontology/familyTrueSpeciesLibellulidae
285621WorkPersonnel< associatedBand <1.0000000.000000The Real Thing (Midnight Oil album)Bones HillmanBones HillmanPERSONNoneMidnight OilMidnight Oil10413938NaNAgentAlbumhttp://dbpedia.org/ontology/associatedBandTrueAgentMidnight Oil
61849ListMolluscs< phylum <0.9076920.043341List of endangered invertebratesHadopyrgus brevisHadopyrgus brevisSPECIESMolluscaHydrobiidsHydrobiidae36240302NaNNaNListhttp://dbpedia.org/ontology/phylumTrueSpeciesMollusca
20940PersonCareer statistics> team >0.9314800.106772Joe LedleyJoe Ledley--NewcastleNewcastleORGNoneCareer statisticsNone10472566NaNNaNSoccerPlayerhttp://dbpedia.org/ontology/teamFalseSportsTeamJoe Ledley
688946PersonPartial filmography< director <0.9223600.164080George Hill (director)Min and BillMin and BillWORK_OF_ARTNoneDirectorNone9569050NaNNaNPersonhttp://dbpedia.org/ontology/directorTrueFilmGeorge Hill (director)
1480502OtherList of species< genus <0.9878050.021681CybisterCybister crassipesCybister crassipesOTHERNoneList of speciesNone24520612NaNNaNOtherhttp://dbpedia.org/ontology/genusTrueSpeciesCybister
\n", + "
" + ], + "text/plain": [ + " P_basetype TS_text rel micro_mean \\\n", + "1804314 Other Species < genus < 0.977019 \n", + "216985 Location Communities < subdivision < 0.957367 \n", + "107055 List Odonata < order < 1.000000 \n", + "1650594 Other Species < genus < 0.977019 \n", + "61042 List Gastropods < family < 1.000000 \n", + "1732647 Other Species < genus < 0.977019 \n", + "2084439 Species Species < genus < 0.984059 \n", + "654105 Person Directing filmography < director < 1.000000 \n", + "806033 Person Film scores < musicComposer < 0.873288 \n", + "611381 Organisation Singles < artist < 0.903226 \n", + "1509909 Other Species < genus < 0.977019 \n", + "1226829 Species Main < genus < 0.985182 \n", + "385490 Location Communities < subdivision < 0.980844 \n", + "795111 Person Works < author < 0.958395 \n", + "63260 List Insects < family < 1.000000 \n", + "285621 Work Personnel < associatedBand < 1.000000 \n", + "61849 List Molluscs < phylum < 0.907692 \n", + "20940 Person Career statistics > team > 0.931480 \n", + "688946 Person Partial filmography < director < 0.922360 \n", + "1480502 Other List of species < genus < 0.987805 \n", + "\n", + " micro_std P \\\n", + "1804314 0.045419 Halimuraena \n", + "216985 0.098188 Christian County, Missouri \n", + "107055 0.000000 List of least concern insects \n", + "1650594 0.045419 Eupoecilia \n", + "61042 0.000000 List of vulnerable molluscs \n", + "1732647 0.045419 Leptoneta \n", + "2084439 0.029726 Triphora (gastropod) \n", + "654105 0.000000 Jonathan Frakes \n", + "806033 0.154829 Don Preston \n", + "611381 0.130987 La Factoría \n", + "1509909 0.045419 Mouse-eared bat \n", + "1226829 0.098228 Apristus \n", + "385490 0.034299 Arkansas County, Arkansas \n", + "795111 0.199014 Krishna Sobti \n", + "63260 0.000000 List of least concern arthropods \n", + "285621 0.000000 The Real Thing (Midnight Oil album) \n", + "61849 0.043341 List of endangered invertebrates \n", + "20940 0.106772 Joe Ledley \n", + "688946 0.164080 George Hill 
(director) \n", + "1480502 0.021681 Cybister \n", + "\n", + " E_ent E_text E_tag \\\n", + "1804314 Halimuraena shakai Halimuraena shakai SPECIES \n", + "216985 Chadwick, Missouri Chadwick GPE \n", + "107055 Pinheyschna subpupillata Stream hawker SPECIES \n", + "1650594 Eupoecilia kruegeriana Eupoecilia kruegeriana SPECIES \n", + "61042 Brotia wykoffi Brotia wykoffi SPECIES \n", + "1732647 Leptoneta maculosa L. maculosa OTHER \n", + "2084439 Triphora dalli Triphora dalli SPECIES \n", + "654105 Jonathan Frakes--All or Nothing All or Nothing WORK_OF_ART \n", + "806033 The Underachievers (film) The Underachievers WORK_OF_ART \n", + "611381 La Factoría--Amiga Amiga WORK_OF_ART \n", + "1509909 Myotis oxyotus Myotis oxyotus OTHER \n", + "1226829 Apristus striatus Apristus striatus OTHER \n", + "385490 Nady, Arkansas Nady GPE \n", + "795111 Krishna Sobti--Badalon Badalon WORK_OF_ART \n", + "63260 Micrathyria caerulistyla Blue-tipped dasher SPECIES \n", + "285621 Bones Hillman Bones Hillman PERSON \n", + "61849 Hadopyrgus brevis Hadopyrgus brevis SPECIES \n", + "20940 Joe Ledley--Newcastle Newcastle ORG \n", + "688946 Min and Bill Min and Bill WORK_OF_ART \n", + "1480502 Cybister crassipes Cybister crassipes OTHER \n", + "\n", + " TS_ent S_text S_ent \\\n", + "1804314 None Species None \n", + "216985 None Other unincorporated communities None \n", + "107055 Odonata Aeshnids Aeshnidae \n", + "1650594 None Species None \n", + "61042 Gastropoda Pachychilids Pachychilidae \n", + "1732647 None Species None \n", + "2084439 Species Species Species \n", + "654105 None Television None \n", + "806033 None Film scores None \n", + "611381 None Singles None \n", + "1509909 None Species None \n", + "1226829 None Main None \n", + "385490 None Unincorporated communities None \n", + "795111 None Novels None \n", + "63260 Insect Libellulids Libellulidae \n", + "285621 None Midnight Oil Midnight Oil \n", + "61849 Mollusca Hydrobiids Hydrobiidae \n", + "20940 None Career statistics None \n", + 
"688946 None Director None \n", + "1480502 None List of species None \n", + "\n", + " E_id TS_enttype S_enttype P_type \\\n", + "1804314 39648306 NaN NaN Other \n", + "216985 1353294 NaN NaN AdministrativeRegion \n", + "107055 36319656 NaN NaN List \n", + "1650594 25698774 NaN NaN Other \n", + "61042 36300492 NaN NaN List \n", + "1732647 37360208 NaN NaN Other \n", + "2084439 32364528 NaN NaN Mollusca \n", + "654105 1103264 NaN NaN Person \n", + "806033 12602413 NaN NaN MusicalArtist \n", + "611381 21081955 NaN NaN Band \n", + "1509909 8796526 NaN NaN Other \n", + "1226829 28762634 NaN NaN Insect \n", + "385490 1240891 NaN NaN AdministrativeRegion \n", + "795111 20960058 NaN NaN Writer \n", + "63260 36130044 NaN NaN List \n", + "285621 10413938 NaN Agent Album \n", + "61849 36240302 NaN NaN List \n", + "20940 10472566 NaN NaN SoccerPlayer \n", + "688946 9569050 NaN NaN Person \n", + "1480502 24520612 NaN NaN Other \n", + "\n", + " pred inv E_predtype \\\n", + "1804314 http://dbpedia.org/ontology/genus True Species \n", + "216985 http://dbpedia.org/ontology/subdivision True PopulatedPlace \n", + "107055 http://dbpedia.org/ontology/order True Species \n", + "1650594 http://dbpedia.org/ontology/genus True Species \n", + "61042 http://dbpedia.org/ontology/family True Species \n", + "1732647 http://dbpedia.org/ontology/genus True Species \n", + "2084439 http://dbpedia.org/ontology/genus True Species \n", + "654105 http://dbpedia.org/ontology/director True Film \n", + "806033 http://dbpedia.org/ontology/musicComposer True Work \n", + "611381 http://dbpedia.org/ontology/artist True MusicalWork \n", + "1509909 http://dbpedia.org/ontology/genus True Species \n", + "1226829 http://dbpedia.org/ontology/genus True Species \n", + "385490 http://dbpedia.org/ontology/subdivision True PopulatedPlace \n", + "795111 http://dbpedia.org/ontology/author True Work \n", + "63260 http://dbpedia.org/ontology/family True Species \n", + "285621 http://dbpedia.org/ontology/associatedBand True 
Agent \n", + "61849 http://dbpedia.org/ontology/phylum True Species \n", + "20940 http://dbpedia.org/ontology/team False SportsTeam \n", + "688946 http://dbpedia.org/ontology/director True Film \n", + "1480502 http://dbpedia.org/ontology/genus True Species \n", + "\n", + " target \n", + "1804314 Halimuraena \n", + "216985 Christian County, Missouri \n", + "107055 Odonata \n", + "1650594 Eupoecilia \n", + "61042 Pachychilidae \n", + "1732647 Leptoneta \n", + "2084439 Triphora (gastropod) \n", + "654105 Jonathan Frakes \n", + "806033 Don Preston \n", + "611381 La Factoría \n", + "1509909 Mouse-eared bat \n", + "1226829 Apristus \n", + "385490 Arkansas County, Arkansas \n", + "795111 Krishna Sobti \n", + "63260 Libellulidae \n", + "285621 Midnight Oil \n", + "61849 Mollusca \n", + "20940 Joe Ledley \n", + "688946 George Hill (director) \n", + "1480502 Cybister " + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_new_relations.sample(20)" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
targetrelE_text
274906Sturgeon County< subdivision <Nywening
864833Anthyllis< genus <Anthyllis splendens
1209316Psilochilus< genus <Psilochilus vallecaucanus
228388Sumner County, Kansas< subdivision <Cicero
1671191Tethya< genus <Tethya irisae
288516Eaton County, Michigan> largestCity >Eaton Rapids
287348Boone County, Missouri> largestCity >Ashland
1758229Ormiscus< genus <Ormiscus vulgaris
1748675Sphecodes< genus <Sphecodes mutillaeformis
1787612Silis (beetle)< genus <Silis parallela
1157946Diplopterygium< genus <Diplopterygium conversum
841225Agrimonia< genus <Agrimonia parviflora
1945864Dermestes< genus <Dermestes patagoniensis
1197572Tamolanica< genus <Tamolanica katauana
268117Tupelo micropolitan area< subdivision <Tupelo
147551Promachus (fly)< genus <Promachus neligens
179891Zonaria> bandMember >Rickard Lundmark
1699982Melanorivulus< genus <Melanorivulus amambaiensis
816105Bacillus< genus <B. haloalkaliphilus
1219087Bubastes< genus <Bubastes blackburni
\n", + "
" + ], + "text/plain": [ + " target rel E_text\n", + "274906 Sturgeon County < subdivision < Nywening\n", + "864833 Anthyllis < genus < Anthyllis splendens\n", + "1209316 Psilochilus < genus < Psilochilus vallecaucanus\n", + "228388 Sumner County, Kansas < subdivision < Cicero\n", + "1671191 Tethya < genus < Tethya irisae\n", + "288516 Eaton County, Michigan > largestCity > Eaton Rapids\n", + "287348 Boone County, Missouri > largestCity > Ashland\n", + "1758229 Ormiscus < genus < Ormiscus vulgaris\n", + "1748675 Sphecodes < genus < Sphecodes mutillaeformis\n", + "1787612 Silis (beetle) < genus < Silis parallela\n", + "1157946 Diplopterygium < genus < Diplopterygium conversum\n", + "841225 Agrimonia < genus < Agrimonia parviflora\n", + "1945864 Dermestes < genus < Dermestes patagoniensis\n", + "1197572 Tamolanica < genus < Tamolanica katauana\n", + "268117 Tupelo micropolitan area < subdivision < Tupelo\n", + "147551 Promachus (fly) < genus < Promachus neligens\n", + "179891 Zonaria > bandMember > Rickard Lundmark\n", + "1699982 Melanorivulus < genus < Melanorivulus amambaiensis\n", + "816105 Bacillus < genus < B. 
def serialize_relations(df, filename):
    """Write the relations in `df` as triples to `filename`.

    Parameters
    ----------
    df : DataFrame with the columns `target`, `pred`, `E_ent` and `inv`.
        `target` and `E_ent` are converted with `clg_util.name2clg_resource`;
        in `pred`, every occurrence of 'dbpedia' is replaced by 'caligraph'.
        If `inv` is truthy, subject and object are swapped.
    filename : path of the output file; an existing file is overwritten.
    """
    triples = []
    # iterate column-wise instead of materialising a Series per row via iterrows()
    for target, pred, entity, is_inverse in zip(df['target'], df['pred'], df['E_ent'], df['inv']):
        s = clg_util.name2clg_resource(target)
        p = pred.replace('dbpedia', 'caligraph')
        o = clg_util.name2clg_resource(entity)
        if is_inverse:
            s, o = o, s
        triples.append(serialize_util.as_object_triple(s, p, o))
    # Resource names contain non-ASCII characters (e.g. 'La Factoría'), so the
    # encoding must not depend on the platform default.
    # writelines() adds no separators; this relies on as_object_triple returning
    # newline-terminated lines (presumably it does -- verify in serialize_util).
    with open(filename, mode='w', encoding='utf-8') as f:
        f.writelines(triples)