TASK: Final documentation
Nicolas Heist committed Dec 15, 2019
1 parent be3635a commit 6e163cf
Showing 8 changed files with 753 additions and 157 deletions.
695 changes: 674 additions & 21 deletions LICENSE

Large diffs are not rendered by default.

1 change: 0 additions & 1 deletion Pipfile
@@ -18,7 +18,6 @@ nltk = "*"
pynif = "*"
tables = "*"
xgboost = "*"
owlready2 = "*"

[dev-packages]

Expand Down
36 changes: 18 additions & 18 deletions README.md
@@ -1,44 +1,44 @@
# CaLiGraph

TODO: Intro-Text
\- A Large Semantic Knowledge Graph from Wikipedia Categories and Listpages \-

## Purpose
todo
For information about the general idea, extraction statistics, and resources of CaLiGraph, visit the [CaLiGraph website](http://caligraph.org).

## Configuration
### Prerequisites
- Python 3
- pipenv (https://pipenv.readthedocs.io/en/latest/)

Note: If you have problems with your pipenv installation, you can also run the code directly via python. Just make sure to install all the dependencies given in `Pipfile` and `Pipfile.lock`.

### System Requirements
- You need a machine with at least 100 GB of RAM as we load most of DBpedia in memory to speed up the extraction
- During the first execution of an extraction you need a stable internet connection as the required DBpedia files are downloaded automatically

### Setup
- In the project source directory, create and initialize a virtual environment with pipenv (run in terminal):

- Create virtual environment with pipenv
```
pipenv install
```

- Download the spacy corpus:
```
pipenv run python -m spacy download en_core_web_lg
```

- Download the wordnet corpus of nltk (run in python):
- If you have not downloaded them already, you have to fetch the latest corpora for spaCy and nltk-wordnet (run in terminal):
```
import nltk
nltk.download('wordnet')
pipenv run python -m spacy download en_core_web_lg # download the most recent corpus of spaCy
pipenv run python -c 'import nltk; nltk.download("wordnet")' # download the wordnet corpus of nltk
```

### Basic Configuration Options

Use `config.yaml` for configuration of the application.
You can configure the application-specific parameters as well as logging- and file-related parameters in `config.yaml`.

## Usage

- Run the application with pipenv:
Run the extraction with pipenv:

```
pipenv run .
pipenv run python3 .
```

## License
MIT.
https://opensource.org/licenses/MIT
All the required resources, like DBpedia files, will be downloaded automatically during execution.
CaLiGraph is serialized in N-Triple format. The resulting files are placed in the `results` folder.
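Since the README says the results are bz2-compressed N-Triple files in `results`, they can be inspected without a triple store. This is a minimal sketch of reading such a file line by line; the helper names and the simplified regex are illustrative assumptions, not part of CaLiGraph.

```python
import bz2
import re

# Simplified N-Triples line pattern: <subject> <predicate> object .
# (Does not cover blank nodes or escaped characters; a sketch only.)
TRIPLE_RE = re.compile(r'^(<[^>]+>)\s+(<[^>]+>)\s+(.+?)\s*\.$')

def parse_ntriples(lines):
    """Yield (subject, predicate, object) string tuples from N-Triples lines."""
    for line in lines:
        line = line.strip()
        if not line or line.startswith('#'):
            continue  # skip blanks and comments
        match = TRIPLE_RE.match(line)
        if match:
            yield match.groups()

def read_result_file(path):
    """Stream triples from a bz2-compressed result file, e.g. results/caligraph-ontology.nt.bz2."""
    with bz2.open(path, 'rt', encoding='utf-8') as f:
        yield from parse_ntriples(f)
```

For serious use, an N-Triples parser from a library such as rdflib would be the more robust choice.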
75 changes: 8 additions & 67 deletions __main__.py
@@ -4,19 +4,11 @@
import impl.category.base as cat_base
import impl.category.cat2ax as cat_axioms
import impl.util.hypernymy as hypernymy_util
import impl.list.store as list_store
import impl.list.base as list_base
import impl.list.parser as list_parser
import impl.list.mapping as list_mapping
import impl.list.features as list_features
#import impl.dbpedia.heuristics as dbp_heur
#import impl.category.cat2ax as cat_axioms
import impl.util.nlp as nlp_util
import impl.caligraph.base as cali_base
import impl.caligraph.serialize as cali_serialize


def setup():
def _setup_hypernyms():
"""Initialisation of hypernyms that are extracted from Wikipedia categories using Cat2Ax axioms."""
category_graph = cat_base.get_conceptual_category_graph()

# initialise cat2ax axioms
@@ -31,65 +23,14 @@ def setup():

if __name__ == '__main__':
try:
util.get_logger().info('Starting caligraph v12..')
util.get_logger().info('Starting extraction of CaLiGraph version 1.')

#graph = cali_base.get_axiom_graph()
# recompute entity labels
#enum_features = list_base.get_enum_listpage_entity_features(graph)
#util.get_logger().debug('Before relabeling (enum)')
#util.get_logger().debug(f'True: {len(enum_features[enum_features["label"] == 1])}')
#util.get_logger().debug(f'False: {len(enum_features[enum_features["label"] == 0])}')
#util.get_logger().debug(f'New: {len(enum_features[enum_features["label"] == -1])}')
#
#list_features.assign_entity_labels(graph, enum_features)
#util.get_logger().debug('After relabeling (enum)')
#util.get_logger().debug(f'True: {len(enum_features[enum_features["label"] == 1])}')
#util.get_logger().debug(f'False: {len(enum_features[enum_features["label"] == 0])}')
#util.get_logger().debug(f'New: {len(enum_features[enum_features["label"] == -1])}')
#util.update_cache('dbpedia_listpage_enum_features', enum_features, version=10)
#
#table_features = list_base.get_table_listpage_entity_features(graph)
#util.get_logger().debug('Before relabeling (table)')
#util.get_logger().debug(f'True: {len(table_features[table_features["label"] == 1])}')
#util.get_logger().debug(f'False: {len(table_features[table_features["label"] == 0])}')
#util.get_logger().debug(f'New: {len(table_features[table_features["label"] == -1])}')
#
#list_features.assign_entity_labels(graph, table_features)
#util.get_logger().debug('After relabeling (table)')
#util.get_logger().debug(f'True: {len(table_features[table_features["label"] == 1])}')
#util.get_logger().debug(f'False: {len(table_features[table_features["label"] == 0])}')
#util.get_logger().debug(f'New: {len(table_features[table_features["label"] == -1])}')
#util.update_cache('dbpedia_listpage_table_features', table_features, version=10)
_setup_hypernyms() # initialise hypernyms
cali_base.serialize_final_graph() # run the complete extraction cycle and end with serializing CaLiGraph



# extract table features
#list_base.get_table_listpage_entity_features()
#nlp_util.persist_cache()


# extract complete caligraph
#setup()
cali_base.serialize_final_graph()

#cat_graph = cat_base.get_merged_graph()
#util.get_logger().info('catgraph done.')
#util.get_logger().info('cache persist done.')
#list_graph = list_base.get_merged_listgraph()
#util.get_logger().info('listgraph done.')
#nlp_util.persist_cache()
#util.get_logger().info('cache persist done.')
#list_mapping.get_parent_categories('http://dbpedia.org/resource/Category:Lists_of_NASCAR_broadcasters')
#util.get_logger().info('mapping done.')
#nlp_util.persist_cache()
#util.get_logger().info('cache persist done.')
#list_base.get_listpage_entity_features()
#util.get_logger().info('extraction done.')
#nlp_util.persist_cache()
#util.get_logger().info('cache persist done.')

mailer.send_success(f'FINISHED caligraph v12')
util.get_logger().info('Finished caligraph v12.')
success_msg = 'Finished extraction of CaLiGraph version 1.'
mailer.send_success(success_msg)
util.get_logger().info(success_msg)
except Exception as e:
error_msg = traceback.format_exc()
mailer.send_error(error_msg)
93 changes: 43 additions & 50 deletions config.yaml
@@ -1,3 +1,4 @@
# -- CALIGRAPH CONFIGURATION --
caligraph:
version: '1.0'
creation_date: '2019-10-01'
@@ -13,21 +14,25 @@ list:
cat2ax:
pattern_confidence: 0.05


# -- LOGGING CONFIGURATION --
logging:
level: 'DEBUG'
to_file: True
filename: 'caligraph_v12'
filename: 'caligraph_v1'

# -- ERROR-MAILER CONFIGURATION --
# -> add information about your mail accounts here and enable the `success` or `error` switches
# if you want to receive information about success or failure of the extraction by mail. <-
mailer:
sender: 'errors.nheist@gmail.com'
password: 'Asdf1234!'
receiver: 'nico.heist@gmail.com'
sender: ~
password: ~
receiver: ~
subject: 'CaLiGraph | '
enable:
success: True
error: True
success: False
error: False

# -- SOURCE FILE CONFIGURATION --
files:
dbpedia:
pages:
@@ -90,14 +95,8 @@ files:
filename: 'webisalod_hypernyms.p'
url: 'http://data.dws.informatik.uni-mannheim.de/CaLiGraph/data/webisalod_hypernyms.p'

# -- RESULT FILE CONFIGURATION --
results:
catgraph:
dbp_type_sample: 'dbp-type-sample.csv'
cat2ax:
relation_axioms: 'cat2ax_relation-axioms.csv'
type_axioms: 'cat2ax_type-axioms.csv'
relation_assertions: 'cat2ax_relation-assertions.csv'
type_assertions: 'cat2ax_type-assertions.csv'
caligraph:
metadata: 'caligraph-metadata.nt.bz2' # void description
ontology: 'caligraph-ontology.nt.bz2' # class hierarchy, labels
@@ -114,62 +113,56 @@ results:
dbpedia_instance-types: 'dbpedia_caligraph-types.nt.bz2' # new dbpedia types found through CaLiGraph
dbpedia_instance-relations: 'dbpedia_caligraph-relations.nt.bz2' # new dbpedia relations found through CaLiGraph

# -- CACHE FILE CONFIGURATION --
cache:
# version 2: added resource types from sdtyped + lhd
# version 5: complete new run for fresh caligraph
# version 6: new extraction with improved headlemma matching
# version 7: recomputation due to improved disjointnesses, subtypes, headlemmas
# version 8: recomputation due to some minor improvements
# version 10: bunch of small fixes; better restrictions; well formatted labels
# version 11: military book bugfix; small fixes
catgraph_conceptual:
filename: 'catgraph-conceptual'
version: 11 # v2: do not remove unconnected nodes initially ; v3: better synonym matching
version: 1
catgraph_wikitaxonomy:
filename: 'catgraph-wikitaxonomy'
version: 11 # v2: do not remove unconnected nodes initially ; v3: better synonym matching
version: 1
catgraph_cyclefree:
filename: 'catgraph-cyclefree'
version: 11 # v2: do not remove unconnected nodes initially ; v3: better synonym matching
version: 1
catgraph_merged:
filename: 'catgraph-merged'
version: 11 # v2: do not remove unconnected nodes initially ; v3: better synonym matching
version: 1
listgraph_base:
filename: 'listgraph-base'
version: 11 # v2: better synonym matching ; v3: with all listpages
version: 1
listgraph_wikitaxonomy:
filename: 'listgraph-wikitaxonomy'
version: 11 # v2: better synonym matching ; v3: with all listpages
version: 1
listgraph_cyclefree:
filename: 'listgraph-cyclefree'
version: 11 # v2: better synonym matching ; v3: with all listpages
version: 1
listgraph_merged:
filename: 'listgraph-merged'
version: 11 # v2: better synonym matching ; v3: with all listpages
version: 1
caligraph_base:
filename: 'caligraph-base'
version: 11
version: 1
caligraph_merged_ontology:
filename: 'caligraph-merged-ontology'
version: 11
version: 1
caligraph_filtered:
filename: 'caligraph-filtered'
version: 11
version: 1
caligraph_axiomatized:
filename: 'caligraph-axiomatized'
version: 12
version: 1
spacy_docs:
filename: 'spacy-docs'
version: 3 # version 2 incl. listpages; version 3 for spacy 2.1
version: 1
dbpedia_resources:
filename: 'dbpedia-resources'
version: 1
dbpedia_resource_type_frequency:
filename: 'dbpedia-resource-type-frequency'
version: 7
version: 1
dbpedia_resource_type_mapping:
filename: 'dbpedia-resource-type-mapping'
version: 7
version: 1
dbpedia_resource_labels:
filename: 'dbpedia-resource-labels'
version: 1
@@ -200,7 +193,7 @@ cache:
compress: True
dbpedia_functional_predicates:
filename: 'dbpedia-functional-predicates'
version: 2 # version 1: strict; version 2: 5% tolerance
version: 1
dbpedia_categories:
filename: 'dbpedia-categories'
version: 1
@@ -212,13 +205,13 @@ cache:
version: 1
dbpedia_category_statistics:
filename: 'dbpedia-category-statistics'
version: 7
version: 1
dbpedia_category_sets:
filename: 'dbpedia-category-sets'
version: 11 # new conceptual categorygraph version (v2)
version: 1
dbpedia_categories_conceptual:
filename: 'dbpedia-category-conceptual'
version: 11
version: 1
dbpedia_heuristic_domains:
filename: 'dbpedia-heuristic-domains'
version: 1
@@ -227,33 +220,33 @@ cache:
version: 1
dbpedia_heuristic_disjoint_types:
filename: 'dbpedia-heuristic-disjoint-types'
version: 7
version: 1
dbpedia_listpages:
filename: 'dbpedia-listpages'
version: 11
version: 1
dbpedia_list_equivalents:
filename: 'dbpedia-list-equivalents'
version: 11 # v2: better synonym matching
version: 1
dbpedia_list_parents:
filename: 'dbpedia-list-parents'
version: 11 # v2: better synonym matching
version: 1
dbpedia_listpage_markup:
filename: 'dbpedia-listpage-markup'
version: 1
dbpedia_listpage_parsed:
filename: 'dbpedia-listpage-parsed'
version: 3 # v1: basic; v2: with entity indices; v3: with tables
version: 1
dbpedia_listpage_enum_features:
filename: 'dbpedia-listpage-enum-features'
version: 10
version: 1
store_as_hdf: True
dbpedia_listpage_table_features:
filename: 'dbpedia-listpage-table-features'
version: 10
version: 1
store_as_csv: True
dbpedia_listpage_entities:
filename: 'dbpedia-listpage-entities'
version: 11
version: 1
wikidata_dbpedia_resource_mapping:
filename: 'wikidata-dbpedia-resource-mapping'
version: 1
@@ -264,7 +257,7 @@ cache:
compress: True
wikitaxonomy_hypernyms:
filename: 'wikitaxonomy-hypernyms'
version: 11 # version 1 -> thresholds 10 / 100 / .4 // v2: new cat2ax version (v2)
version: 1
cat2ax_axioms:
filename: 'cat2ax-axioms'
version: 11 # new conceptual categorygraph version (v2)
version: 1
2 changes: 2 additions & 0 deletions mailer.py
@@ -1,3 +1,5 @@
"""Mailer to inform about success or failure of the extraction."""

import smtplib
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
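The `mailer.py` imports above (`smtplib`, `email.mime`) together with the `mailer` section of `config.yaml` suggest a conventional SMTP notifier. A hedged sketch of what such a mailer might look like; the function names, the Gmail host, and the port are assumptions, not the actual `mailer.py` implementation:

```python
import smtplib
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText

def build_message(sender: str, receiver: str, subject_prefix: str, body: str) -> MIMEMultipart:
    """Assemble a plain-text notification mail (subject_prefix matches config.yaml's 'CaLiGraph | ')."""
    msg = MIMEMultipart()
    msg['From'] = sender
    msg['To'] = receiver
    msg['Subject'] = subject_prefix + body[:50]  # keep the subject short
    msg.attach(MIMEText(body, 'plain'))
    return msg

def send_message(msg: MIMEMultipart, password: str, host: str = 'smtp.gmail.com', port: int = 587):
    """Send via STARTTLS; host/port are assumptions for a Gmail sender account."""
    with smtplib.SMTP(host, port) as server:
        server.starttls()
        server.login(msg['From'], password)
        server.send_message(msg)
```

With `success`/`error` disabled in the config (the new defaults), a real mailer would skip sending entirely, which is why the credential fields can safely stay `~` (null).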
6 changes: 6 additions & 0 deletions type_lexicalisations.py
@@ -1,3 +1,9 @@
"""Extraction of type lexicalisations from the Wikipedia corpus.
The resulting cache file is already placed at `data/cache/dbpedia-type-lexicalisations_v1.p.bz2`
but can be recomputed with this script.
"""

import util
from typing import Tuple
import re
