diff --git a/.gitignore b/.gitignore index 50bce15..e6ddf0c 100644 --- a/.gitignore +++ b/.gitignore @@ -117,4 +117,4 @@ doc/_build notebooks/ models .DS_Store -venv \ No newline at end of file +venv diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..373dbbd --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,29 @@ +repos: +- repo: https://github.com/ambv/black + rev: 21.6b0 + hooks: + - id: black +- repo: https://github.com/pycqa/isort + rev: 5.9.1 + hooks: + - id: isort + args: ["--profile", "black", "--filter-files"] +- repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.0.1 + hooks: + - id: trailing-whitespace + - id: end-of-file-fixer + - id: debug-statements + - id: check-docstring-first + - id: check-yaml + - id: check-added-large-files + - id: pretty-format-json + args: ["--autofix"] +- repo: https://github.com/asottile/pyupgrade + rev: v2.20.0 + hooks: + - id: pyupgrade +- repo: https://github.com/PyCQA/flake8 + rev: 3.9.2 + hooks: + - id: flake8 diff --git a/LICENSE b/LICENSE index aae2f33..281d399 100644 --- a/LICENSE +++ b/LICENSE @@ -616,4 +616,4 @@ above cannot be given local legal effect according to their terms, reviewing courts shall apply local law that most closely approximates an absolute waiver of all civil liability in connection with the Program, unless a warranty or assumption of liability accompanies a -copy of the Program in return for a fee. \ No newline at end of file +copy of the Program in return for a fee. diff --git a/doc/_static/loss_functions/loss_avg.csv b/doc/_static/loss_functions/loss_avg.csv index 3636332..bfeb425 100644 --- a/doc/_static/loss_functions/loss_avg.csv +++ b/doc/_static/loss_functions/loss_avg.csv @@ -1600,4 +1600,4 @@ "7990","7995","0.2672945559024811","undefined","7995","0.34593504667282104","undefined","7995","0.3321243226528168","undefined","7995","0.35803699493408203","undefined","7995","0.3589259684085846","undefined" "7995","8000","0.33078715205192566","undefined","8000","0.30306801199913025","undefined","8000","0.3155066967010498","undefined","8000","0.35373300313949585","undefined","8000","0.3202584385871887","undefined" "8000","8005","0.32276827096939087","undefined","8005","0.35233697295188904","undefined","8005","0.344290554523468","undefined","8005","0.33728644251823425","undefined","8005","0.29650598764419556","undefined" -"8005","8010","0.3273674249649048","undefined","8010","0.3167572319507599","undefined","8010","0.3407208025455475","undefined","8010","0.3501826226711273","undefined","8010","0.34778228402137756","undefined" \ No newline at end of file +"8005","8010","0.3273674249649048","undefined","8010","0.3167572319507599","undefined","8010","0.3407208025455475","undefined","8010","0.3501826226711273","undefined","8010","0.34778228402137756","undefined" diff --git a/doc/_static/loss_functions/loss_avg_seq_mean.csv b/doc/_static/loss_functions/loss_avg_seq_mean.csv index 8931a0b..317bd57 100644 --- a/doc/_static/loss_functions/loss_avg_seq_mean.csv +++ b/doc/_static/loss_functions/loss_avg_seq_mean.csv @@ -1600,4 +1600,4 @@ "7990","7995","0.2740124464035034","undefined","7995","0.35077494382858276","undefined","7995","0.3478199243545532","undefined","7995","0.35866016149520874","undefined","7995","0.3791751563549042","undefined" "7995","8000","0.34686484932899475","undefined","8000","0.31498754024505615","undefined","8000","0.3321347236633301","undefined","8000","0.3559577465057373","undefined","8000","0.33466649055480957","undefined" 
"8000","8005","0.3251478672027588","undefined","8005","0.3561927378177643","undefined","8005","0.36540666222572327","undefined","8005","0.3430340886116028","undefined","8005","0.30841362476348877","undefined" -"8005","8010","0.33854520320892334","undefined","8010","0.32373565435409546","undefined","8010","0.3489498198032379","undefined","8010","0.3549184799194336","undefined","8010","0.3679059147834778","undefined" \ No newline at end of file +"8005","8010","0.33854520320892334","undefined","8010","0.32373565435409546","undefined","8010","0.3489498198032379","undefined","8010","0.3549184799194336","undefined","8010","0.3679059147834778","undefined" diff --git a/doc/_static/loss_functions/loss_avg_seq_sum.csv b/doc/_static/loss_functions/loss_avg_seq_sum.csv index ed994a9..019c585 100644 --- a/doc/_static/loss_functions/loss_avg_seq_sum.csv +++ b/doc/_static/loss_functions/loss_avg_seq_sum.csv @@ -1600,4 +1600,4 @@ "7990","7995","8.76839828491211","undefined","7995","11.224798202514648","undefined","7995","11.130237579345703","undefined","7995","11.47712516784668","undefined","7995","12.133605003356934","undefined" "7995","8000","11.099675178527832","undefined","8000","10.079601287841797","undefined","8000","10.628311157226562","undefined","8000","11.390647888183594","undefined","8000","10.709327697753906","undefined" "8000","8005","10.404731750488281","undefined","8005","11.398167610168457","undefined","8005","11.693013191223145","undefined","8005","10.977090835571289","undefined","8005","9.86923599243164","undefined" -"8005","8010","10.833446502685547","undefined","8010","10.359540939331055","undefined","8010","11.166394233703613","undefined","8010","11.357391357421875","undefined","8010","11.772989273071289","undefined" \ No newline at end of file +"8005","8010","10.833446502685547","undefined","8010","10.359540939331055","undefined","8010","11.166394233703613","undefined","8010","11.357391357421875","undefined","8010","11.772989273071289","undefined" diff --git a/doc/_static/loss_functions/loss_total.csv b/doc/_static/loss_functions/loss_total.csv index e6cdfe9..616055e 100644 --- a/doc/_static/loss_functions/loss_total.csv +++ b/doc/_static/loss_functions/loss_total.csv @@ -1600,4 +1600,4 @@ "7990","7995","198.599853515625","undefined","7995","246.30575561523438","undefined","7995","187.65023803710938","undefined","7995","218.76060485839844","undefined","7995","202.7931671142578","undefined" "7995","8000","226.91998291015625","undefined","8000","192.14512634277344","undefined","8000","192.14356994628906","undefined","8000","219.31446838378906","undefined","8000","189.59300231933594","undefined" "8000","8005","211.0904541015625","undefined","8005","221.9722900390625","undefined","8005","215.87017822265625","undefined","8005","213.1650390625","undefined","8005","194.21142578125","undefined" -"8005","8010","212.13409423828125","undefined","8010","188.47055053710938","undefined","8010","214.9948272705078","undefined","8010","219.21432495117188","undefined","8010","211.79940795898438","undefined" \ No newline at end of file +"8005","8010","212.13409423828125","undefined","8010","188.47055053710938","undefined","8010","214.9948272705078","undefined","8010","219.21432495117188","undefined","8010","211.79940795898438","undefined" diff --git a/doc/_static/loss_functions/loss_total_norm_batch.csv b/doc/_static/loss_functions/loss_total_norm_batch.csv index d4963f9..d9158f5 100644 --- a/doc/_static/loss_functions/loss_total_norm_batch.csv +++ b/doc/_static/loss_functions/loss_total_norm_batch.csv 
@@ -1600,4 +1600,4 @@ "7990","7995","6.206245422363281","undefined","7995","7.697054862976074","undefined","7995","5.864069938659668","undefined","7995","6.836268901824951","undefined","7995","6.337286472320557","undefined" "7995","8000","7.091249465942383","undefined","8000","6.00453519821167","undefined","8000","6.004486560821533","undefined","8000","6.853577136993408","undefined","8000","5.924781322479248","undefined" "8000","8005","6.596576690673828","undefined","8005","6.936634063720703","undefined","8005","6.745943069458008","undefined","8005","6.661407470703125","undefined","8005","6.0691070556640625","undefined" -"8005","8010","6.629190444946289","undefined","8010","5.889704704284668","undefined","8010","6.718588352203369","undefined","8010","6.850447654724121","undefined","8010","6.618731498718262","undefined" \ No newline at end of file +"8005","8010","6.629190444946289","undefined","8010","5.889704704284668","undefined","8010","6.718588352203369","undefined","8010","6.850447654724121","undefined","8010","6.618731498718262","undefined" diff --git a/doc/_static/summarization-model-experiments-raw-data.csv b/doc/_static/summarization-model-experiments-raw-data.csv index bcd24f7..238443e 100644 --- a/doc/_static/summarization-model-experiments-raw-data.csv +++ b/doc/_static/summarization-model-experiments-raw-data.csv @@ -70,4 +70,4 @@ "loss-test_avg_seq_mean","2020-04-17T02:34:16.000Z","13849","loss-func-test-old","","2020-11-19T22:45:56.000Z","239269211beb","","","","","","","","","","3h7bb65i","finished","-","hhousen","","2020-04-16T22:43:27.000Z","","2","1e-8","O1","","1","","0.1","","binary","./cnn_dm_pt/bert-base-uncased","true","true","-1","1","0.00002","false","false","INFO","","loss_avg_seq_mean","3","512","1","","distilbert-base-uncased","bert","","true","","0","5","4","false","adamw","none","0.6","sent_rep_tokens","32","","2","true","true","50","6","","32","top_k","3","test","","","false","","","32","train","","false","wandb","linear","32","val","","1800","0.01","","","","","","","","","","","","","","","","","","","","","8005","","","","","","","","","","","","","0.3167572319507599","0.32373565435409546","10.359540939331056","188.47055053710935","5.889704704284668","0.918835997581482","0.5851296782493591","0.25142356753349304","0.30322131514549255","0.31995004415512085","10.238401412963867","176.49374389648438","5.515429496765137","" "loss-test_avg_seq_sum","2020-04-16T20:38:12.000Z","14230","loss-func-test-old","","2020-11-19T22:45:56.000Z","5f1cc95c99b6","","","","","","","","","","1nhtgv2c","finished","-","hhousen","","2020-04-16T16:41:02.000Z","","2","1e-8","O1","","1","","0.1","","binary","./cnn_dm_pt/bert-base-uncased","true","true","-1","1","0.00002","false","false","INFO","","loss_avg_seq_sum","3","512","1","","distilbert-base-uncased","bert","","true","","0","5","4","false","adamw","none","0.6","sent_rep_tokens","32","","2","true","true","50","6","","32","top_k","3","test","","","false","","","32","train","","false","wandb","linear","32","val","","1800","0.01","","","","","","","","","","","","","","","","","","","","","8005","","","","","","","","","","","","","0.3407208025455475","0.3489498198032379","11.166394233703612","214.9948272705078","6.718588352203369","0.9136065244674684","0.5195010304450989","0.125395268201828","0.31573837995529175","0.3229138255119324","10.333242416381836","187.5449981689453","5.860781192779541","" 
"loss-test_total_norm_batch","2020-04-16T08:36:54.000Z","13855","loss-func-test-old","","2020-11-19T22:45:56.000Z","17e694aceeed","","","","","","","","","","39586k7k","finished","-","hhousen","","2020-04-16T04:45:59.000Z","","2","1e-8","O1","","1","","0.1","","binary","./cnn_dm_pt/bert-base-uncased","true","true","-1","1","0.00002","false","false","INFO","","loss_total_norm_batch","3","512","1","","distilbert-base-uncased","bert","","true","","0","5","4","false","adamw","none","0.6","sent_rep_tokens","32","","2","true","true","50","6","","32","top_k","3","test","","","false","","","32","train","","false","wandb","linear","32","val","","1800","0.01","","","","","","","","","","","","","","","","","","","","","8005","","","","","","","","","","","","","0.3501826226711273","0.3549184799194336","11.357391357421877","219.21432495117188","6.850447654724121","0.91948664188385","0.5414910316467285","0.16349512338638306","0.30516937375068665","0.3220357596874237","10.30514430999756","177.595458984375","5.549858093261719","" -"loss-test_total","2020-04-16T04:44:56.000Z","13842","loss-func-test-old","","2020-11-19T22:45:56.000Z","17e694aceeed","","","","","","","","","","1lto2dd2","finished","-","hhousen","","2020-04-16T00:54:14.000Z","","2","1e-8","O1","","1","","0.1","","binary","./cnn_dm_pt/bert-base-uncased","true","true","-1","1","0.00002","false","false","INFO","","loss_total","3","512","1","","distilbert-base-uncased","bert","","true","","0","5","4","false","adamw","none","0.6","sent_rep_tokens","32","","2","true","true","50","6","","32","top_k","3","test","","","false","","","32","train","","false","wandb","linear","32","val","","1800","0.01","","","","","","","","","","","","","","","","","","","","","8005","","","","","","","","","","","","","0.34778228402137756","0.3679059147834778","11.772989273071287","211.7994079589844","6.618731498718262","0.919694483280182","0.5366775989532471","0.15366090834140778","0.3051919937133789","0.32205986976623535","10.305915832519531","177.60853576660156","5.550266742706299","" \ No newline at end of file +"loss-test_total","2020-04-16T04:44:56.000Z","13842","loss-func-test-old","","2020-11-19T22:45:56.000Z","17e694aceeed","","","","","","","","","","1lto2dd2","finished","-","hhousen","","2020-04-16T00:54:14.000Z","","2","1e-8","O1","","1","","0.1","","binary","./cnn_dm_pt/bert-base-uncased","true","true","-1","1","0.00002","false","false","INFO","","loss_total","3","512","1","","distilbert-base-uncased","bert","","true","","0","5","4","false","adamw","none","0.6","sent_rep_tokens","32","","2","true","true","50","6","","32","top_k","3","test","","","false","","","32","train","","false","wandb","linear","32","val","","1800","0.01","","","","","","","","","","","","","","","","","","","","","8005","","","","","","","","","","","","","0.34778228402137756","0.3679059147834778","11.772989273071287","211.7994079589844","6.618731498718262","0.919694483280182","0.5366775989532471","0.15366090834140778","0.3051919937133789","0.32205986976623535","10.305915832519531","177.60853576660156","5.550266742706299","" diff --git a/doc/abstractive/api.rst b/doc/abstractive/api.rst index 57b9eb1..fdaf8ad 100644 --- a/doc/abstractive/api.rst +++ b/doc/abstractive/api.rst @@ -4,4 +4,4 @@ Abstractive API Reference Model/Module ------------ -.. automodule:: abstractive \ No newline at end of file +.. 
automodule:: abstractive diff --git a/doc/abstractive/training.rst b/doc/abstractive/training.rst index ddcb780..3634671 100644 --- a/doc/abstractive/training.rst +++ b/doc/abstractive/training.rst @@ -70,7 +70,7 @@ Step-by-Step Instructions 2. Extract (≈90GB): ``tar -xzvf longformer-encdec-base-8192.tar.gz`` 3. Training command: - .. code-block:: + .. code-block:: python main.py \ --mode abstractive \ diff --git a/doc/extractive/api.rst b/doc/extractive/api.rst index d34a0e5..602c92b 100644 --- a/doc/extractive/api.rst +++ b/doc/extractive/api.rst @@ -27,4 +27,3 @@ Convert To Extractive --------------------- .. automodule:: convert_to_extractive - diff --git a/doc/extractive/convert-to-extractive.rst b/doc/extractive/convert-to-extractive.rst index 39fd66a..77d7780 100644 --- a/doc/extractive/convert-to-extractive.rst +++ b/doc/extractive/convert-to-extractive.rst @@ -23,7 +23,7 @@ Simply run ``convert_to_extractive.py`` with the path to the data. For example, * ``--shard_interval`` processes the file in chunks of ``5000`` and writes results to disk in chunks of ``5000`` (saves RAM) * ``--compression`` compresses each output chunk with gzip (depending on the dataset reduces space usage requirement by about 1/2 to 1/3) -* ``--add_target_to`` will save the abstractive target text to the splits (in ``--split_names``) specified. +* ``--add_target_to`` will save the abstractive target text to the splits (in ``--split_names``) specified. The default output directory is the input directory that was specified, but the output directory can be changed with ``--base_output_path`` if desired. @@ -36,7 +36,7 @@ Option 2: Automatic pre-processing through ``nlp`` You will need to run the ``convert_to_extractive.py`` command with the ``--dataset``, ``--dataset_version``, ``--data_example_column``, and ``--data_summarized_column`` options set. To use the CNN/DM dataset you would set these arguments as shown below: -.. code-block:: +.. code-block:: --dataset cnn_dailymail \ --dataset_version 3.0.0 \ @@ -66,7 +66,7 @@ Extractive Dataset Format This section briefly discusses the format of datasets created by the ``convert_to_extractive`` script. -The training and validation sets only need the ``src`` and ``labels`` keys saved as json. The ``src`` value should be a list of lists where each list contains a series of tokens (see below). The ``labels`` value is a list of 0s (not in summary) and 1s (sentence should be in summary) that is the same length as the ``src`` value (the number of sentences). Each value in this list corresponds to a sentence in ``src``. The testing set is special because it needs the ``src``, ``labels``, and ``tgt`` keys. The ``tgt`` key represents the target summary as a single string with a ```` between each sentence. +The training and validation sets only need the ``src`` and ``labels`` keys saved as json. The ``src`` value should be a list of lists where each list contains a series of tokens (see below). The ``labels`` value is a list of 0s (not in summary) and 1s (sentence should be in summary) that is the same length as the ``src`` value (the number of sentences). Each value in this list corresponds to a sentence in ``src``. The testing set is special because it needs the ``src``, ``labels``, and ``tgt`` keys. The ``tgt`` key represents the target summary as a single string with a ```` between each sentence. 
First document in **CNN/DM** extractive **training** set: @@ -151,4 +151,4 @@ Script Help --max_example_nsents MAX_EXAMPLE_NSENTS maximum number of sentences per example -l {DEBUG,INFO,WARNING,ERROR,CRITICAL}, --log {DEBUG,INFO,WARNING,ERROR,CRITICAL} - Set the logging level (default: 'Info'). \ No newline at end of file + Set the logging level (default: 'Info'). diff --git a/doc/extractive/experiments.rst b/doc/extractive/experiments.rst index 9657c18..5e268f8 100644 --- a/doc/extractive/experiments.rst +++ b/doc/extractive/experiments.rst @@ -95,7 +95,7 @@ Commit `dfefd15` added a :class:`~classifier.SimpleLinearClassifier`. This exper Command used to run the tests: -.. code-block:: +.. code-block:: python main.py \ --model_name_or_path distilbert-base-uncased \ @@ -697,4 +697,4 @@ Classifier/Encoder Results **Relative Time:** .. image:: ../_static/encoder/loss_avg_seq_mean_reltime.png - :width: 48% \ No newline at end of file + :width: 48% diff --git a/doc/extractive/models-results.rst b/doc/extractive/models-results.rst index bd90161..828f4c4 100644 --- a/doc/extractive/models-results.rst +++ b/doc/extractive/models-results.rst @@ -3,7 +3,7 @@ Extractive Pre-trained Models & Results ======================================= -The recommended model to use is ``distilroberta-base-ext-sum`` because of its fast performance, relatively low number of parameters, and good performance. +The recommended model to use is ``distilroberta-base-ext-sum`` because of its fast performance, relatively low number of parameters, and good performance. Notes ----- @@ -14,7 +14,7 @@ The remarkable performance to size ratio of the distil* models can be transferre `MobileBERT `_ is similar to ``distilbert`` in that it is a smaller version of BERT that achieves amazing performance at a very small size. `According to the authors `__, MobileBERT is *2.64x smaller and 2.45x faster* than DistilBERT. DistilBERT successfully halves the depth of BERT model by knowledge distillation in the pre-training stage and an optional fine-tuning stage. MobileBERT only uses knowledge transfer in the pre-training stage and does not require a fine-tuned teacher or data augmentation in the down-stream tasks. DistilBERT compresses BERT by reducing its depth, while MobileBERT compresses BERT by reducing its width, which has been shown to be more effective. MobileBERT usually needs a larger learning rate and more training epochs in fine-tuning than the original BERT. -.. important:: Interactive charts, graphs, raw data, run commands, hyperparameter choices, and more for all trained models are publicly available on the `TransformerSum Weights & Biases page `__. You can download the raw data for each model on this site, or `download an overview as a CSV <../_static/summarization-model-experiments-raw-data.csv>`__. Please open an `issue `__ if you have questions about these models. +.. important:: Interactive charts, graphs, raw data, run commands, hyperparameter choices, and more for all trained models are publicly available on the `TransformerSum Weights & Biases page `__. You can download the raw data for each model on this site, or `download an overview as a CSV <../_static/summarization-model-experiments-raw-data.csv>`__. Please open an `issue `__ if you have questions about these models. Additionally, all of the models on this page were trained completely for free using Tesla P100-PCIE-16GB GPUs on `Google Colaboratory `_. 
Those that took over 12 hours to train were split into multiple training sessions since ``pytorch_lightning`` enables easy resuming with the ``--resume_from_checkpoint`` argument. @@ -130,7 +130,7 @@ Test set results on the WikiHow dataset using ROUGE F\ :sub:`1`\ . +---------------------------------+------------+------------+------------+-------------+ | distilroberta-base-ext-sum | 31.07 | 8.96 | 19.34 | 28.95 | +---------------------------------+------------+------------+------------+-------------+ -| bert-base-uncased-ext-sum | 30.68 | 08.67 | 19.16 | 28.59 | +| bert-base-uncased-ext-sum | 30.68 | 08.67 | 19.16 | 28.59 | +---------------------------------+------------+------------+------------+-------------+ | roberta-base-ext-sum | 31.26 | 09.09 | 19.47 | 29.14 | +---------------------------------+------------+------------+------------+-------------+ diff --git a/doc/extractive/training.rst b/doc/extractive/training.rst index 3ec15ba..2446579 100644 --- a/doc/extractive/training.rst +++ b/doc/extractive/training.rst @@ -6,7 +6,7 @@ Training an Extractive Summarization Model Details ------- -Once the dataset has been converted to the extractive task, it can be used as input to a :class:`data.SentencesProcessor`, which has a :meth:`~data.SentencesProcessor.add_examples()` function to add sets of ``(example, labels)`` and a :meth:`~data.SentencesProcessor.get_features()` function that processes the data and prepares it to be inputted into the model (``input_ids``, ``attention_masks``, ``labels``, ``token_type_ids``, ``sent_rep_token_ids``, ``sent_rep_token_ids_masks``). Feature extraction runs in parallel and tokenizes text using the tokenizer appropriate for the model specified with ``--model_name_or_path``. The tokenizer can be changed to another ``huggingface/transformers`` tokenizer with the ``--tokenizer_name`` option. +Once the dataset has been converted to the extractive task, it can be used as input to a :class:`data.SentencesProcessor`, which has a :meth:`~data.SentencesProcessor.add_examples()` function to add sets of ``(example, labels)`` and a :meth:`~data.SentencesProcessor.get_features()` function that processes the data and prepares it to be inputted into the model (``input_ids``, ``attention_masks``, ``labels``, ``token_type_ids``, ``sent_rep_token_ids``, ``sent_rep_token_ids_masks``). Feature extraction runs in parallel and tokenizes text using the tokenizer appropriate for the model specified with ``--model_name_or_path``. The tokenizer can be changed to another ``huggingface/transformers`` tokenizer with the ``--tokenizer_name`` option. For the :ref:`CNN/DM dataset `, to train a model for 50,000 steps on the data run: diff --git a/doc/general/about.rst b/doc/general/about.rst index 51580b5..d1e4003 100644 --- a/doc/general/about.rst +++ b/doc/general/about.rst @@ -59,7 +59,7 @@ This project integrates with `rouge-score `_ and ran the below `commands from Kavita Ganesan `_ (`Archive `__) to fix the WordNet exceptions: -.. code-block:: +.. code-block:: cd data/WordNet-2.0-Exceptions/ ./buildExeptionDB.pl . exc WordNet-2.0.exc.db diff --git a/doc/general/api.rst b/doc/general/api.rst index ce422a9..4abc22a 100644 --- a/doc/general/api.rst +++ b/doc/general/api.rst @@ -6,4 +6,4 @@ This page contains the API reference for modules that contain code used for both Helpers ------- -.. automodule:: helpers \ No newline at end of file +.. 
automodule:: helpers diff --git a/doc/general/getting-started.rst b/doc/general/getting-started.rst index c45aedb..12f534e 100644 --- a/doc/general/getting-started.rst +++ b/doc/general/getting-started.rst @@ -31,7 +31,7 @@ If all you want to do is summarize a text string using a pre-trained model then 1. Download a summarization model. Link to :ref:`pre-trained extractive models `. Link to :ref:`pre-trained abstractive models `. 2. Put the model in a folder named ``models`` in the project root. -3. Run ``python predictions_website.py`` and open the link. +3. Run ``python predictions_website.py`` and open the link. 4. On the website enter your text, select your downloaded model, and click "SUBMIT". Programmatically @@ -59,7 +59,7 @@ If you want to summarize text using a pre-trained model from python code then fo 3. Run prediction on a string of text: .. code-block:: python - + text_to_summarize = "Something Awesome" summary = model.predict(text_to_summarize) @@ -85,15 +85,15 @@ To be clear, this is an abstractive dataset so we will convert it to the extract Command to convert dataset to extractive (:ref:`more info `): -.. code-block:: +.. code-block:: python convert_to_extractive.py ./datasets/cnn_dailymail_processor/cnn_dm --shard_interval 5000 --compression --add_target_to test -Once we have an extractive dataset, we need to convert the text into features that the computer can understand. This includes ``input_ids``, ``attention_mask``, ``sent_rep_token_ids``, and more. The :meth:`extractive.ExtractiveSummarizer.forward` and :meth:`data.SentencesProcessor.get_features` docstrings explains these features nicely. The `huggingface/transformers glossary `_ is a good resource as well. This conversion to model-specific features happens automatically before training begins. Since the features are model-specific, the training script is responsible for converting the data. It creates a :class:`~data.SentencesProcessor` that does most of the heavy lifting. You can learn more about this automatic preprocessing on the :ref:`data_automatic_preprocessing` page. +Once we have an extractive dataset, we need to convert the text into features that the computer can understand. This includes ``input_ids``, ``attention_mask``, ``sent_rep_token_ids``, and more. The :meth:`extractive.ExtractiveSummarizer.forward` and :meth:`data.SentencesProcessor.get_features` docstrings explains these features nicely. The `huggingface/transformers glossary `_ is a good resource as well. This conversion to model-specific features happens automatically before training begins. Since the features are model-specific, the training script is responsible for converting the data. It creates a :class:`~data.SentencesProcessor` that does most of the heavy lifting. You can learn more about this automatic preprocessing on the :ref:`data_automatic_preprocessing` page. Command to only pre-process the data and stop right before training would begin (:ref:`more info `): -.. code-block:: +.. code-block:: python main.py --data_path ./datasets/cnn_dailymail_processor/cnn_dm --use_logger tensorboard --model_name_or_path bert-base-uncased --model_type bert --do_train --only_preprocess @@ -101,7 +101,7 @@ If you didn't run the above commands then download the ``bert-base-uncased-ext-s Training command: -.. code-block:: +.. code-block:: python main.py \ --model_name_or_path bert-base-uncased \ @@ -123,7 +123,7 @@ You can learn more about the above command on :ref:`train_extractive_model`. 
Abstractive Summarization ^^^^^^^^^^^^^^^^^^^^^^^^^ -Lets train a model that performs abstractive summarization. Whereas autoencoding models are used for extractive summarization, sequence-to-sequence (seq2seq) models are used for abstractive summarization. In short, autoregressive models correspond to the decoder of the original transformer model, autoencoding models correspond to the encoder, and sequence-to-sequence models use both the encoder and the decoder of the original transformer. +Lets train a model that performs abstractive summarization. Whereas autoencoding models are used for extractive summarization, sequence-to-sequence (seq2seq) models are used for abstractive summarization. In short, autoregressive models correspond to the decoder of the original transformer model, autoencoding models correspond to the encoder, and sequence-to-sequence models use both the encoder and the decoder of the original transformer. .. note:: Sequence-to-sequence models use both the encoder and the decoder of the original transformer, either for translation tasks or by transforming other tasks to sequence-to-sequence problems. They can be fine-tuned to many tasks but their most natural applications are translation, summarization and question answering. The original transformer model is an example of such a model (only for translation), T5 is an example that can be fine-tuned on other tasks. diff --git a/environment.yml b/environment.yml index f5224bd..c212351 100644 --- a/environment.yml +++ b/environment.yml @@ -9,6 +9,7 @@ dependencies: - spacy - sphinx - pyarrow + - pre-commit - pip - pip: - pytorch_lightning @@ -18,4 +19,4 @@ dependencies: - rouge-score - packaging - datasets - - gradio \ No newline at end of file + - gradio diff --git a/example_deepspeed_config.json b/example_deepspeed_config.json index 73d7a36..4dc5fe8 100644 --- a/example_deepspeed_config.json +++ b/example_deepspeed_config.json @@ -1,30 +1,33 @@ { - "zero_allow_untested_optimizer": true, - "optimizer": { - "type": "OneBitAdam", - "params": { - "lr": 2e-5, - "betas": [0.998, 0.999], - "eps": 1e-8, - "weight_decay": 1e-2, - "cuda_aware": true - } + "optimizer": { + "params": { + "betas": [ + 0.998, + 0.999 + ], + "cuda_aware": true, + "eps": 1e-08, + "lr": 2e-05, + "weight_decay": 0.01 }, - "scheduler": { - "type": "WarmupLR", - "params": { - "last_batch_iteration": -1, - "warmup_min_lr": 0, - "warmup_max_lr": 2e-5, - "warmup_num_steps": 2000 - } + "type": "OneBitAdam" + }, + "scheduler": { + "params": { + "last_batch_iteration": -1, + "warmup_max_lr": 2e-05, + "warmup_min_lr": 0, + "warmup_num_steps": 2000 }, - "zero_optimization": { - "stage": 2, - "cpu_offload": true, - "contiguous_gradients": true, - "overlap_comm": true, - "allgather_bucket_size": 2e8, - "reduce_bucket_size": 2e8 - } -} \ No newline at end of file + "type": "WarmupLR" + }, + "zero_allow_untested_optimizer": true, + "zero_optimization": { + "allgather_bucket_size": 200000000.0, + "contiguous_gradients": true, + "cpu_offload": true, + "overlap_comm": true, + "reduce_bucket_size": 200000000.0, + "stage": 2 + } +} diff --git a/predictions_website.py b/predictions_website.py index 1b62063..7e0dc74 100644 --- a/predictions_website.py +++ b/predictions_website.py @@ -1,10 +1,11 @@ +import glob import os import sys -import glob + import gradio as gr sys.path.insert(0, os.path.abspath("./src")) -from extractive import ExtractiveSummarizer +from extractive import ExtractiveSummarizer # noqa: E402 def summarize_text(text, model_choice): diff --git 
a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..95efa50 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,4 @@ +[tool.isort] +profile = "black" +multi_line_output = 3 +line_length = 88 diff --git a/scripts/convert_extractive_pt_to_txt.py b/scripts/convert_extractive_pt_to_txt.py index b27b079..77c9a8b 100644 --- a/scripts/convert_extractive_pt_to_txt.py +++ b/scripts/convert_extractive_pt_to_txt.py @@ -1,16 +1,21 @@ -from argparse import ArgumentParser import os +from argparse import ArgumentParser + import torch from tqdm import tqdm + def convert_extractive_pt_to_txt(path): full_directory_listing = os.listdir(path) - all_pt_files = [x for x in full_directory_listing if os.path.splitext(x)[1] == ".pt"] + all_pt_files = [ + x for x in full_directory_listing if os.path.splitext(x)[1] == ".pt" + ] for file_path in tqdm(all_pt_files, desc="Converting PT to TXT"): file_path = os.path.join(path, file_path) torch_data = torch.load(file_path) with open(file_path[:-2] + "txt", "w+") as file: - file.write('\n'.join([str(x).replace("'", '"') for x in torch_data]) + '\n') + file.write("\n".join([str(x).replace("'", '"') for x in torch_data]) + "\n") + if __name__ == "__main__": parser = ArgumentParser() diff --git a/scripts/convert_to_arrow.py b/scripts/convert_to_arrow.py index 54cc5aa..64e97b8 100644 --- a/scripts/convert_to_arrow.py +++ b/scripts/convert_to_arrow.py @@ -1,6 +1,8 @@ from argparse import ArgumentParser + import pyarrow as pa from pyarrow import json + import datasets as nlp @@ -55,7 +57,8 @@ def convert_to_arrow( "--cache_path_prefix", type=str, default="./data_chunk", - help="The cache path and file name prefix for the converted JSON files. Defaults to './data_chunk'.", + help="The cache path and file name prefix for the converted JSON files. 
" + + "Defaults to './data_chunk'.", ) parser.add_argument( "--no_combine", diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 0000000..8361e06 --- /dev/null +++ b/setup.cfg @@ -0,0 +1,6 @@ +[flake8] +max-line-length = 100 +max-complexity = 25 +extend-ignore = + # See https://github.com/PyCQA/pycodestyle/issues/373 + E203, diff --git a/src/abstractive.py b/src/abstractive.py index 99ab0ab..0d7fd8e 100644 --- a/src/abstractive.py +++ b/src/abstractive.py @@ -1,42 +1,42 @@ -import os -import sys +import itertools import logging +import os import random -import torch -import datasets as nlp +import sys +from argparse import ArgumentParser +from collections import OrderedDict +from functools import partial +from time import time + +import numpy as np import pyarrow -import itertools +import pytorch_lightning as pl import spacy -import numpy as np +import torch +from rouge_score import rouge_scorer, scoring from spacy.lang.en import English -from functools import partial -from time import time -from collections import OrderedDict -from argparse import ArgumentParser from torch import nn -from rouge_score import rouge_scorer, scoring from torch.utils.data import DataLoader -import pytorch_lightning as pl -from transformers import ( - AutoTokenizer, - EncoderDecoderModel, - AutoModelForSeq2SeqLM, -) +from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, EncoderDecoderModel + +import datasets as nlp +from convert_to_extractive import tokenize from helpers import ( - pad, LabelSmoothingLoss, SortishSampler, + generic_configure_optimizers, + pad, pad_tensors, test_rouge, - generic_configure_optimizers, ) -from convert_to_extractive import tokenize logger = logging.getLogger(__name__) def trim_batch( - input_ids, pad_token_id, attention_mask=None, + input_ids, + pad_token_id, + attention_mask=None, ): """Remove columns that are populated exclusively by ``pad_token_id``.""" keep_column_mask = input_ids.ne(pad_token_id).any(dim=0) @@ -74,7 +74,8 @@ def longformer_modifier(final_dictionary, tokenizer, attention_window): for key, item in final_dictionary.items(): final_dictionary[key] = pad_tensors( - item, nearest_multiple_of=attention_window[0], + item, + nearest_multiple_of=attention_window[0], ) return final_dictionary @@ -213,8 +214,8 @@ def forward( if you are unsure what a forward function is. Args: - source (``torch.LongTensor`` of shape ``(batch_size, sequence_length)``, optional): Indices - of input sequence tokens in the vocabulary for the encoder. + source (``torch.LongTensor`` of shape ``(batch_size, sequence_length)``, optional): + Indices of input sequence tokens in the vocabulary for the encoder. `What are input IDs? `_ Defaults to None. target (``torch.LongTensor`` of shape ``(batch_size, target_sequence_length)``, optional): Provide @@ -235,16 +236,17 @@ def forward( tuple: (cross_entropy_loss, prediction_scores) The cross entropy loss and the prediction scores, which are the scores for each token in the vocabulary for each token in the output. - """ + """ # noqa: E501 # `self.model.forward()` returns `decoder_outputs + encoder_outputs` where # `decoder_outputs` and `encoder_outputs` are dictionaries. 
+ # `labels` is None here so that `huggingface/transformers` does not calculate loss outputs = self.model.forward( input_ids=source.contiguous(), attention_mask=source_mask, decoder_input_ids=target, decoder_attention_mask=target_mask, use_cache=(labels is None), - labels=None, # `labels` is None here so that `huggingface/transformers` does not calculate loss + labels=None, **kwargs ) @@ -290,7 +292,8 @@ def prepare_data(self): ) if self.hparams.no_prepare_data or all_tokenized_files_present: logger.info( - "Skipping data preparation because `--no_prepare_data` was specified or all the final tokenized data files are present." + "Skipping data preparation because `--no_prepare_data` was specified or all the " + + "final tokenized data files are present." ) if self.hparams.only_preprocess: logger.info( @@ -309,10 +312,12 @@ def convert_to_features(example_batch): article = article.strip() try: article_encoded = self.tokenizer( - article, padding="max_length", truncation=True, + article, + padding="max_length", + truncation=True, ) articles_encoded_step.append(article_encoded) - except: # skipcq: FLK-E722 + except Exception: # skipcq: FLK-E722 print("Failed to tokenize article: {}".format(article)) sys.exit(1) @@ -321,7 +326,7 @@ def convert_to_features(example_batch): first_length = len(articles_encoded_step[0]["input_ids"]) assert ( current_length == first_length - ), "The length of the current input, {}, does not match the length of the first input, {}.".format( + ), "The length of the current input, {}, does not match the length of the first input, {}.".format( # noqa: E501 current_length, first_length ) @@ -407,7 +412,9 @@ def convert_to_features(example_batch): # The articles have already been padded because they do not need the extra # `boseq` and `eoseq` tokens. highlights_input_ids = pad( - highlights_input_ids, self.tokenizer.pad_token_id, width=max_length, + highlights_input_ids, + self.tokenizer.pad_token_id, + width=max_length, ) highlights_attention_masks = pad( highlights_attention_masks, 0, width=max_length @@ -548,7 +555,8 @@ def remove_empty(batch_item): # Exit if set to only preprocess the data if self.hparams.only_preprocess: logger.info( - "Exiting because data has been pre-processed and the `--only_preprocess` option is enabled." + "Exiting because data has been pre-processed and the `--only_preprocess` option " + + "is enabled." ) sys.exit(0) @@ -689,7 +697,7 @@ def _step(self, batch): return loss def training_step(self, batch, batch_idx): # skipcq: PYL-W0613 - """Training step: `PyTorch Lightning Documentation `__""" + """Training step: `PyTorch Lightning Documentation `__""" # noqa: E501 cross_entropy_loss = self._step(batch) self.log("train_loss", cross_entropy_loss, prog_bar=True) @@ -697,7 +705,7 @@ def training_step(self, batch, batch_idx): # skipcq: PYL-W0613 return cross_entropy_loss def validation_step(self, batch, batch_idx): # skipcq: PYL-W0613 - """Validation step: `PyTorch Lightning Documentation `__""" + """Validation step: `PyTorch Lightning Documentation `__""" # noqa: E501 cross_entropy_loss = self._step(batch) self.log("val_loss", cross_entropy_loss, prog_bar=True) @@ -707,7 +715,7 @@ def test_step(self, batch, batch_idx): # skipcq: PYL-W0613 Similar to :meth:`~abstractive.AbstractiveSummarizer.validation_step` in that in runs the inputs through the model. However, this method also calculates the ROUGE scores for each example-summary pair. 
- """ + """ # noqa: E501 source_ids, target_ids, source_mask, _ = ( batch["source"], batch["target"], @@ -792,7 +800,7 @@ def test_epoch_end(self, outputs): """ Called at the end of a testing epoch: `PyTorch Lightning Documentation `__ Finds the mean of all the metrics logged by :meth:`~abstractive.AbstractiveSummarizer.test_step`. - """ + """ # noqa: E501 avg_generation_time = np.array([x["generation_time"] for x in outputs]).mean() rouge_scores_log = {} @@ -812,7 +820,8 @@ def test_epoch_end(self, outputs): # and values that are `AggregateScore` objects. Each `AggregateScore` object is a # named tuple with a low, mid, and high value. Each value is a `Score` object, which # is also a named tuple, that contains the precision, recall, and fmeasure values. - # For more info see the source code: https://github.com/google-research/google-research/blob/master/rouge/scoring.py + # For more info see the source code: + # https://github.com/google-research/google-research/blob/master/rouge/scoring.py rouge_result = aggregator.aggregate() for metric, value in rouge_result.items(): @@ -938,7 +947,9 @@ def ids_to_clean_text(self, generated_ids, replace_sep_with_q=False): ) gen_texts = self.tokenizer.batch_decode( - generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True, + generated_ids, + skip_special_tokens=True, + clean_up_tokenization_spaces=True, ) if len(gen_texts) == 1: @@ -966,13 +977,19 @@ def add_model_specific_args(parent_parser): "--model_name_or_path", type=str, default="bert-base-uncased", - help="Path to pre-trained model or shortcut name. A list of shortcut names can be found at https://huggingface.co/transformers/pretrained_models.html. Community-uploaded models are located at https://huggingface.co/models. Default is 'bert-base-uncased'.", + help="Path to pre-trained model or shortcut name. A list of shortcut names can " + + "be found at https://huggingface.co/transformers/pretrained_models.html. " + + "Community-uploaded models are located at https://huggingface.co/models. " + + "Default is 'bert-base-uncased'.", ) parser.add_argument( "--decoder_model_name_or_path", type=str, default=None, - help="Path to pre-trained model or shortcut name to use as the decoder if an EncoderDecoderModel architecture is desired. If this option is not specified, the shortcut name specified by `--model_name_or_path` is loaded using the Seq2seq AutoModel. Default is 'bert-base-uncased'.", + help="Path to pre-trained model or shortcut name to use as the decoder if an " + + "EncoderDecoderModel architecture is desired. If this option is not specified, " + + "the shortcut name specified by `--model_name_or_path` is loaded using the " + + "Seq2seq AutoModel. Default is 'bert-base-uncased'.", ) parser.add_argument( "--batch_size", @@ -984,19 +1001,23 @@ def add_model_specific_args(parent_parser): "--val_batch_size", default=None, type=int, - help="Batch size per GPU/CPU for evaluation. This option overwrites `--batch_size` for evaluation only.", + help="Batch size per GPU/CPU for evaluation. This option overwrites `--batch_size` " + + "for evaluation only.", ) parser.add_argument( "--test_batch_size", default=None, type=int, - help="Batch size per GPU/CPU for testing. This option overwrites `--batch_size` for testing only.", + help="Batch size per GPU/CPU for testing. This option overwrites `--batch_size` for " + + "testing only.", ) parser.add_argument( "--dataloader_num_workers", default=3, type=int, - help="The number of workers to use when loading data. 
A general place to start is to set num_workers equal to the number of CPUs on your machine. More details here: https://pytorch-lightning.readthedocs.io/en/latest/performance.html#num-workers", + help="The number of workers to use when loading data. A general place to start is " + + "to set num_workers equal to the number of CPUs on your machine. " + + "More details here: https://pytorch-lightning.readthedocs.io/en/latest/performance.html#num-workers", # noqa: E501 ) parser.add_argument( "--only_preprocess", @@ -1012,7 +1033,10 @@ def add_model_specific_args(parent_parser): "--dataset", nargs="+", default="cnn_dailymail", - help="The dataset name from the `nlp` library or a list of paths to Apache Arrow files (that can be loaded with `nlp`) in the order train, validation, test to use for training/evaluation/testing. Paths must contain a '/' to be interpreted correctly. Default is `cnn_dailymail`.", + help="The dataset name from the `nlp` library or a list of paths to Apache Arrow " + + "files (that can be loaded with `nlp`) in the order train, validation, test to " + + "use for training/evaluation/testing. Paths must contain a '/' to be interpreted " + + "correctly. Default is `cnn_dailymail`.", ) parser.add_argument( "--dataset_version", @@ -1024,13 +1048,15 @@ def add_model_specific_args(parent_parser): "--data_example_column", type=str, default="article", - help="The column of the `nlp` dataset that contains the text to be summarized. Default value is for the `cnn_dailymail` dataset.", + help="The column of the `nlp` dataset that contains the text to be summarized. " + + "Default value is for the `cnn_dailymail` dataset.", ) parser.add_argument( "--data_summarized_column", type=str, default="highlights", - help="The column of the `nlp` dataset that contains the summarized text. Default value is for the `cnn_dailymail` dataset.", + help="The column of the `nlp` dataset that contains the summarized text. " + + "Default value is for the `cnn_dailymail` dataset.", ) parser.add_argument( "--cache_file_path", @@ -1042,25 +1068,27 @@ def add_model_specific_args(parent_parser): "--split_char", type=str, default=None, - help="""If the `--data_summarized_column` is already split into sentences then use - this option to specify which token marks sentence boundaries. If the summaries are - not split into sentences then spacy will be used to split them. The default is None, + help="""If the `--data_summarized_column` is already split into sentences then use + this option to specify which token marks sentence boundaries. If the summaries are + not split into sentences then spacy will be used to split them. The default is None, which means to use spacy.""", ) parser.add_argument( "--use_percentage_of_data", type=float, default=False, - help="When filtering the dataset, only save a percentage of the data. This is useful for debugging when you don't want to process the entire dataset.", + help="When filtering the dataset, only save a percentage of the data. This is " + + "useful for debugging when you don't want to process the entire dataset.", ) parser.add_argument( "--save_percentage", type=float, default=0.01, - help="""Percentage (divided by batch_size) between 0 and 1 of the predicted and target - summaries from the test set to save to disk during testing. This depends on batch - size: one item from each batch is saved `--save_percentage` percent of the time. 
- Thus, you can expect `len(dataset)*save_percentage/batch_size` summaries to be saved.""", + help="""Percentage (divided by batch_size) between 0 and 1 of the predicted and target + summaries from the test set to save to disk during testing. This depends on batch + size: one item from each batch is saved `--save_percentage` percent of the time. + Thus, you can expect `len(dataset)*save_percentage/batch_size` summaries to be + saved.""", ) parser.add_argument( "--save_hg_transformer", @@ -1070,41 +1098,46 @@ def add_model_specific_args(parent_parser): parser.add_argument( "--test_use_pyrouge", action="store_true", - help="""Use `pyrouge`, which is an interface to the official ROUGE software, instead of - the pure-python implementation provided by `rouge-score`. You must have the real ROUGE - package installed. More details about ROUGE 1.5.5 here: https://github.com/andersjo/pyrouge/tree/master/tools/ROUGE-1.5.5. + help="""Use `pyrouge`, which is an interface to the official ROUGE software, instead of + the pure-python implementation provided by `rouge-score`. You must have the real ROUGE + package installed. More details about ROUGE 1.5.5 here: https://github.com/andersjo/pyrouge/tree/master/tools/ROUGE-1.5.5. It is recommended to use this option for official scores. The `ROUGE-L` measurements - from `pyrouge` are equivalent to the `rougeLsum` measurements from the default - `rouge-score` package.""", + from `pyrouge` are equivalent to the `rougeLsum` measurements from the default + `rouge-score` package.""", # noqa: E501 ) parser.add_argument( "--sentencizer", action="store_true", - help="Use a spacy sentencizer instead of a statistical model for sentence detection (much faster but less accurate) during data preprocessing; see https://spacy.io/api/sentencizer.", + help="Use a spacy sentencizer instead of a statistical model for sentence " + + "detection (much faster but less accurate) during data preprocessing; see " + + "https://spacy.io/api/sentencizer.", ) parser.add_argument( "--model_max_length", type=int, default=None, - help="Changes the `model_max_length` attribute of the tokenizer. Overrides the default length of input sequences generated during data processing.", + help="Changes the `model_max_length` attribute of the tokenizer. Overrides the " + + "default length of input sequences generated during data processing.", ) parser.add_argument( "--gen_max_len", type=int, default=None, - help="Maximum sequence length during generation while testing and when using the `predict()` function.", + help="Maximum sequence length during generation while testing and when using the " + + "`predict()` function.", ) parser.add_argument( "--label_smoothing", type=float, default=0.1, - help="`LabelSmoothingLoss` implementation from OpenNMT (https://bit.ly/2ObgVPP) as stated in the original paper https://arxiv.org/abs/1512.00567.", + help="`LabelSmoothingLoss` implementation from OpenNMT (https://bit.ly/2ObgVPP) as " + + "stated in the original paper https://arxiv.org/abs/1512.00567.", ) parser.add_argument( "--sortish_sampler", action="store_true", - help="""Reorganize the input_ids by length with a bit of randomness. This can help - to avoid memory errors caused by large batches by forcing large batches to be + help="""Reorganize the input_ids by length with a bit of randomness. 
This can help + to avoid memory errors caused by large batches by forcing large batches to be processed first.""", ) parser.add_argument( @@ -1116,7 +1149,11 @@ def add_model_specific_args(parent_parser): parser.add_argument( "--tie_encoder_decoder", action="store_true", - help="Tie the encoder and decoder weights. Only takes effect when using an EncoderDecoderModel architecture with the `--decoder_model_name_or_path` option. Specifying this option is equivalent to the 'share' architecture tested in 'Leveraging Pre-trained Checkpoints for Sequence Generation Tasks' (https://arxiv.org/abs/1907.12461).", + help="Tie the encoder and decoder weights. Only takes effect when using an " + + "EncoderDecoderModel architecture with the `--decoder_model_name_or_path` " + + "option. Specifying this option is equivalent to the 'share' architecture " + + "tested in 'Leveraging Pre-trained Checkpoints for Sequence Generation Tasks' " + + "(https://arxiv.org/abs/1907.12461).", ) return parser diff --git a/src/classifier.py b/src/classifier.py index 1106e2a..6141ee4 100644 --- a/src/classifier.py +++ b/src/classifier.py @@ -1,7 +1,8 @@ -import sys import logging -from packaging import version +import sys + import torch +from packaging import version from torch import nn logger = logging.getLogger(__name__) @@ -10,7 +11,8 @@ from transformers.activations import get_activation except ImportError: logger.warning( - "Could not import `get_activation` from `transformers.activations`. Only GELU will be available for use in the classifier." + "Could not import `get_activation` from `transformers.activations`. Only GELU will be " + + "available for use in the classifier." ) @@ -101,11 +103,12 @@ class TransformerEncoderClassifier(nn.Module): Arguments: d_model (int): The number of expected features in the input nhead (int, optional): The number of heads in the multiheadattention models. Default is 8. - dim_feedforward (int, optional): The dimension of the feedforward network model. Default is 2048. + dim_feedforward (int, optional): The dimension of the feedforward network model. + Default is 2048. dropout (float, optional): The dropout value. Default is 0.1. num_layers (int, optional): The number of ``TransformerEncoderLayer``\ s. Default is 2. - reduction (nn.Module, optional): a nn.Module that maps `d_model` inputs to 1 value; if not specified - then a ``nn.Sequential()`` module consisting of a linear layer and a + reduction (nn.Module, optional): a nn.Module that maps `d_model` inputs to 1 value; if not + specified then a ``nn.Sequential()`` module consisting of a linear layer and a sigmoid will automatically be created. Default is ``nn.Sequential(linear, sigmoid)``. """ @@ -122,7 +125,8 @@ def __init__( if version.parse(torch.__version__) < version.parse("1.5.0"): logger.error( - "You have PyTorch version %s installed, but `TransformerEncoderClassifier` requires at least version 1.5.0.", + "You have PyTorch version %s installed, but `TransformerEncoderClassifier` " + + "requires at least version 1.5.0.", torch.__version__, ) sys.exit(1) @@ -150,19 +154,22 @@ def forward(self, x, mask): """ # add dimension in the middle attn_mask = mask.unsqueeze(1) - # expand the middle dimension to the same size as the last dimension (the number of sentences/source length) - # Example with batch size 2: There are two masks since there are two sequences in the batch. Each mask - # is a list of booleans for each sentence vector. 
The below line expands each of these lists by duplicating - # them until they are each as long as the number of sentences. Now instead of a list of booleans, each mask - # is a matrix where each row is identical. This effectively masks tokens where the entire column is False. - # Slight Explanation (for 2D not 3D): https://discuss.pytorch.org/t/how-to-add-padding-mask-to-nn-transformerencoder-module/63390/3 - # Detailed Explanation for Beginners: https://github.com/bentrevett/pytorch-seq2seq/blob/master/4%20-%20Packed%20Padded%20Sequences%2C%20Masking%2C%20Inference%20and%20BLEU.ipynb - # PyTorch MultiheadAttention Docs: https://pytorch.org/docs/stable/nn.html#torch.nn.MultiheadAttention.forward + # expand the middle dimension to the same size as the last dimension (the number of + # sentences/source length) + # Example with batch size 2: There are two masks since there are two sequences in the + # batch. Each mask is a list of booleans for each sentence vector. The below line expands + # each of these lists by duplicating them until they are each as long as the number of + # sentences. Now instead of a list of booleans, each mask is a matrix where each row is + # identical. This effectively masks tokens where the entire column is False. + # Slight Explanation (for 2D not 3D): https://discuss.pytorch.org/t/how-to-add-padding-mask-to-nn-transformerencoder-module/63390/3 # noqa: E501 + # Detailed Explanation for Beginners: https://github.com/bentrevett/pytorch-seq2seq/blob/master/4%20-%20Packed%20Padded%20Sequences%2C%20Masking%2C%20Inference%20and%20BLEU.ipynb # noqa: E501 + # PyTorch MultiheadAttention Docs: https://pytorch.org/docs/stable/nn.html#torch.nn.MultiheadAttention.forward # noqa: E501 attn_mask = attn_mask.expand(-1, attn_mask.size(2), -1) # repeat the mask for each attention head attn_mask = attn_mask.repeat(self.nhead, 1, 1) # attn_mask is shape (batch size*num_heads, target sequence length, source sequence length) - # set all the 0's (False) to negative infinity and the 1's (True) to 0.0 because the attn_mask is additive + # set all the 0's (False) to negative infinity and the 1's (True) to 0.0 because the + # attn_mask is additive attn_mask = ( attn_mask.float() .masked_fill(attn_mask == 0, float("-inf")) diff --git a/src/convert_to_extractive.py b/src/convert_to_extractive.py index 782e2c3..7f24446 100644 --- a/src/convert_to_extractive.py +++ b/src/convert_to_extractive.py @@ -1,22 +1,24 @@ -import os -import re import gc -import sys -import json -import gzip import glob -import math +import gzip import itertools +import json import logging -import spacy -import datasets as hf_nlp -from spacy.lang.en import English +import math +import os +import re +import sys from argparse import ArgumentParser from functools import partial from multiprocessing import Pool -from tqdm import tqdm from time import time -from helpers import load_json, _get_word_ngrams + +import spacy +from spacy.lang.en import English +from tqdm import tqdm + +import datasets as hf_nlp +from helpers import _get_word_ngrams, load_json logger = logging.getLogger(__name__) @@ -29,11 +31,13 @@ # Run source and target through oracle_id algorithm # Run current preprocess_examples() function (data cleaning) in data processor # Return source (as list of sentences) and target -# In map() loop: append each (source, target, labels) to variable and save (as cnn_dm_extractive) once done +# In map() loop: append each (source, target, labels) to variable and save (as +# cnn_dm_extractive) once done # BertSum: # 1. 
Tokenize all files into tokenized json versions -# 2. Split json into source and target AND concat stories into chunks of `shard_size` number of stories +# 2. Split json into source and target AND concat stories into chunks of `shard_size` +# number of stories # 3. Process to obtain extractive summary and labels for each shard # 4. Save each processed shard as list of dictionaries with processed values @@ -66,8 +70,9 @@ def read_in_chunks(file_object, chunk_size=5000): def convert_to_extractive_driver(args): """ Driver function to convert an abstractive summarization dataset to an extractive dataset. - The abstractive dataset must be formatted with two files for each split: a source and target file. - Example file list for two splits: ``["train.source", "train.target", "val.source", "val.target"]`` + The abstractive dataset must be formatted with two files for each split: a source and target + file. Example file list for two splits: + ``["train.source", "train.target", "val.source", "val.target"]`` """ # default is to output to input data directory if no output directory specified if not args.base_output_path: @@ -111,7 +116,7 @@ def convert_to_extractive_driver(args): if args.dataset: target_file_len = len(current_dataset) else: - target_file_len = sum([1 for line in target_file]) + target_file_len = sum(1 for line in target_file) # reset pointer back to beginning after getting length target_file.seek(0) @@ -230,7 +235,8 @@ def convert_to_extractive_process( logger.info("Processing %s", name) t0 = time() for (preprocessed_data, target_doc) in pool.map( - _example_processor, zip(source_docs_tokenized, target_docs_tokenized), + _example_processor, + zip(source_docs_tokenized, target_docs_tokenized), ): if preprocessed_data is not None: # preprocessed_data is (source_doc, labels) @@ -277,7 +283,8 @@ def resume(output_path, split, chunk_size): return None # get the first match because and convert to int so max() operator works - # more info about the below RegEx: https://stackoverflow.com/a/1454936 (https://web.archive.org/web/20200701145857/https://stackoverflow.com/questions/1454913/regular-expression-to-find-a-string-included-between-two-characters-while-exclud/1454936) + # more info about the below RegEx: https://stackoverflow.com/a/1454936 + # (https://web.archive.org/web/20200701145857/https://stackoverflow.com/questions/1454913/regular-expression-to-find-a-string-included-between-two-characters-while-exclud/1454936) # noqa: E501 shard_file_idxs = [ int(re.search(r"(?<=\.)(.*?)(?=\.)", a).group(1)) for a in all_json_in_split ] @@ -336,12 +343,16 @@ def check_resume_success( logger.info("Previous (to resume line) Source Line: %s", preprocessed_line) # skipcq: PYL-W1201 logger.info( - ( - "Common causes of this issue:\n" - + "1. You changed the `--shard_interval`. You used a different interval previously than you used in the command to resume.\n" - + "2. The abstractive (`.source` and `.target`) or extractive (`.json`) dataset files were modified or removed. The last `.json` file needs to be in the same folder it was originally outputted to so the last shard index and be determined and the last line can be read.\n" - + "3. It is entirely possible that there is a bug in this script. If you have checked that the above were not the cause and that there were no issues pertaining to your dataset then open an issue at https://github.com/HHousen/TransformerSum/issues/new." - ) + "Common causes of this issue:\n" + + "1. You changed the `--shard_interval`. 
You used a different interval previously " + "than you used in the command to resume.\n" + "2. The abstractive (`.source` and `.target`) or extractive (`.json`) dataset " + "files were modified or removed. The last `.json` file needs to be in the same " + "folder it was originally outputted to so the last shard index can be determined " + "and the last line can be read.\n" + "3. It is entirely possible that there is a bug in this script. If you have checked " + "that the above were not the cause and that there were no issues pertaining to your " + "dataset then open an issue at https://github.com/HHousen/TransformerSum/issues/new." ) return False @@ -395,7 +406,11 @@ def tokenize( tokenized = [] for doc in tqdm( - nlp.pipe(docs, n_process=n_process, batch_size=batch_size,), + nlp.pipe( + docs, + n_process=n_process, + batch_size=batch_size, + ), total=len(docs), desc="Tokenizing" + name, mininterval=tokenizer_log_interval, @@ -416,7 +431,8 @@ def tokenize( del doc_sents logger.debug("Done in %.2f seconds", time() - t0) - # `sents` is an array of documents where each document is an array sentences where each sentence is an array of tokens + # `sents` is an array of documents where each document is an array of sentences where each + # sentence is an array of tokens return sents @@ -433,8 +449,8 @@ def example_processor(inputs, args, oracle_mode="greedy", no_preprocess=False): # `oracle_ids` to labels labels = [0] * len(source_doc) - for l in oracle_ids: - labels[l] = 1 + for l_id in oracle_ids: + labels[l_id] = 1 # The number of sentences in the source document should equal the number of labels. # There should be one label per sentence. @@ -443,7 +459,9 @@ def example_processor(inputs, args, oracle_mode="greedy", no_preprocess=False): + str(source_doc) + "\nLabels: " + str(labels) - + "\n^^ The above document and label combination are not equal in length. The cause of this problem in not known. This check exists to prevent further problems down the data processing pipeline." + + "\n^^ The above document and label combination are not equal in length. The cause of " + + "this problem is not known. This check exists to prevent further problems down the " + + "data processing pipeline."
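A tiny illustration (made-up sentences and oracle indices) of the `oracle_ids` to labels conversion shown above: each sentence index chosen by the oracle algorithm is flipped to 1 in an otherwise all-zero vector, giving exactly one label per source sentence.

source_doc = [
    ["the", "first", "sentence"],
    ["the", "second", "sentence"],
    ["the", "third", "sentence"],
    ["the", "fourth", "sentence"],
]
oracle_ids = [0, 2]  # sentence indexes selected by the oracle algorithm

labels = [0] * len(source_doc)
for l_id in oracle_ids:
    labels[l_id] = 1

assert len(labels) == len(source_doc)  # one label per sentence
print(labels)  # [1, 0, 1, 0]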
) if no_preprocess: @@ -491,7 +509,7 @@ def preprocess( # Section Methods (to convert abstractive summary to extractive) -# Copied from https://github.com/nlpyang/BertSum/blob/9aa6ab84faf3a50724ce7112c780a4651de289b0/src/prepro/data_builder.py +# Copied from https://github.com/nlpyang/BertSum/blob/9aa6ab84faf3a50724ce7112c780a4651de289b0/src/prepro/data_builder.py # noqa: E501 def combination_selection(doc_sent_list, abstract_sent_list, summary_size): def _rouge_clean(s): return re.sub(r"[^a-zA-Z0-9 ]", "", s) @@ -650,7 +668,9 @@ def cal_rouge(evaluated_ngrams, reference_ngrams): help="use gzip compression when saving data", ) parser.add_argument( - "--resume", action="store_true", help="resume from last shard", + "--resume", + action="store_true", + help="resume from last shard", ) parser.add_argument( "--tokenizer_log_interval", @@ -661,12 +681,14 @@ def cal_rouge(evaluated_ngrams, reference_ngrams): parser.add_argument( "--sentencizer", action="store_true", - help="use a spacy sentencizer instead of a statistical model for sentence detection (much faster but less accurate); see https://spacy.io/api/sentencizer", + help="use a spacy sentencizer instead of a statistical model for sentence " + + "detection (much faster but less accurate); see https://spacy.io/api/sentencizer", ) parser.add_argument( "--no_preprocess", action="store_true", - help="do not run the preprocess function, which removes sentences that are too long/short and examples that have too few/many sentences", + help="do not run the preprocess function, which removes sentences that are too " + + "long/short and examples that have too few/many sentences", ) parser.add_argument( "--min_sentence_ntokens", @@ -696,7 +718,8 @@ def cal_rouge(evaluated_ngrams, reference_ngrams): "--dataset", type=str, default=None, - help="The dataset name from the `nlp` library to use for training/evaluation/testing. Default is None.", + help="The dataset name from the `nlp` library to use for training/evaluation/testing. " + + "Default is None.", ) parser.add_argument( "--dataset_version", @@ -708,7 +731,8 @@ def cal_rouge(evaluated_ngrams, reference_ngrams): "--data_example_column", type=str, default=None, - help="The column of the `nlp` dataset that contains the text to be summarized. Default is None.", + help="The column of the `nlp` dataset that contains the text to be summarized. " + + "Default is None.", ) parser.add_argument( "--data_summarized_column", @@ -728,7 +752,9 @@ def cal_rouge(evaluated_ngrams, reference_ngrams): if main_args.resume and not main_args.shard_interval: parser.error( - "Resuming requires both shard mode (--shard_interval) to be enabled and shards to be created. Must use same 'shard_interval' that was used previously to create the files to be resumed from." + "Resuming requires both shard mode (--shard_interval) to be enabled and " + + "shards to be created. Must use same 'shard_interval' that was used " + + "previously to create the files to be resumed from." 
) # The `nlp` library has specific names for the dataset split names so set them diff --git a/src/data.py b/src/data.py index e2ccd51..55edc82 100644 --- a/src/data.py +++ b/src/data.py @@ -1,14 +1,16 @@ -import os -import gc import copy +import gc import json -import random -import logging import linecache -import torch -import numpy as np -from multiprocessing import Pool +import logging +import os +import random from functools import partial +from multiprocessing import Pool + +import numpy as np +import torch + from helpers import pad logger = logging.getLogger(__name__) @@ -54,7 +56,7 @@ def pad_batch_collate(batch, modifier=None): ``modifier`` function will be called directly before ``final_dictionary`` is returned in :meth:`~data.pad_batch_collate`. This allows for easy extendability. - """ + """ # noqa: E501 elem = batch[0] final_dictionary = {} @@ -82,7 +84,7 @@ def pad_batch_collate(batch, modifier=None): # tokens are attended to. attention_mask = [[1] * len(ids) for ids in input_ids] - input_ids_width = max([len(ids) for ids in input_ids]) + input_ids_width = max(len(ids) for ids in input_ids) input_ids = pad(input_ids, 0, width=input_ids_width) input_ids = torch.tensor(input_ids) attention_mask = pad(attention_mask, 0) @@ -91,7 +93,7 @@ def pad_batch_collate(batch, modifier=None): if "sent_lengths" in elem: sent_lengths = [] sent_lengths_mask = [] - sent_lengths_width = max([len(d["sent_lengths"]) + 1 for d in batch]) + sent_lengths_width = max(len(d["sent_lengths"]) + 1 for d in batch) for d in batch: current_sent_lens = d["sent_lengths"] current_sent_lengths_mask = [True] * len(current_sent_lens) @@ -103,9 +105,10 @@ def pad_batch_collate(batch, modifier=None): current_sent_lens.append(0) current_sent_lengths_mask.append(False) # if a value needs to be added to make `sum(current_sent_lens)` the total input - # sequence length OR there is one more number to add (this can happen if the input - # sequence exactly ends with a sentence, making the total of the lengths the length - # of the sequence, or if there is one sentence that takes up the entire sequence) + # sequence length OR there is one more number to add (this can happen if the + # input sequence exactly ends with a sentence, making the total of the lengths + # the length of the sequence, or if there is one sentence that takes up the + # entire sequence) if total_value_to_add > 0 or num_to_add == 1: current_sent_lens.append(total_value_to_add) current_sent_lengths_mask.append(False) @@ -179,7 +182,7 @@ def __getitem__(self, index): line_str = linecache.getline(file_path, linecache_index).rstrip("\n") try: line_json = json.loads(line_str) - except: + except Exception: print("** JSON Loading Error **") print(file_path) print(index) @@ -254,7 +257,8 @@ def __len__(self): return self.total_length logger.debug( - "Calculating length of `IterableDataset` by loading each file, getting the length, and unloading, which is slow." + "Calculating length of `IterableDataset` by loading each file, getting the length, " + + "and unloading, which is slow." ) total_length = 0 for data_file in self.files_list: @@ -364,13 +368,15 @@ class SentencesProcessor: r"""Create a `SentencesProcessor` Arguments: - name (str, optional): A label for the ``SentencesProcessor`` object, used internally for saving if - a save name is not specified in :meth:`data.SentencesProcessor.get_features`, Default is None. 
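To make the collation logic above concrete, here is a self-contained sketch with a simplified stand-in for the `pad` helper from `helpers.py` and a made-up two-example batch: ones mark real tokens in the attention mask and both structures are padded to the width of the longest example.

import torch

def pad(data, pad_id, width=None):
    # simplified stand-in for helpers.pad: right-pad every row with pad_id up to width
    if not width:
        width = max(len(d) for d in data)
    return [d + [pad_id] * (width - len(d)) for d in data]

input_ids = [[101, 2023, 2003, 102], [101, 2003, 102]]  # hypothetical token ids
attention_mask = [[1] * len(ids) for ids in input_ids]

input_ids_width = max(len(ids) for ids in input_ids)
input_ids = torch.tensor(pad(input_ids, 0, width=input_ids_width))
attention_mask = torch.tensor(pad(attention_mask, 0))

print(input_ids)       # tensor([[ 101, 2023, 2003,  102], [ 101, 2003,  102,    0]])
print(attention_mask)  # tensor([[1, 1, 1, 1], [1, 1, 1, 0]])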
+ name (str, optional): A label for the ``SentencesProcessor`` object, used internally for + saving if a save name is not specified in :meth:`data.SentencesProcessor.get_features`, + Default is None. labels (list, optional): The label that goes with each sample, can be a list of lists where the inside lists are the labels for each sentence in the coresponding example. Default is None. examples (list, optional): List of ``InputExample``\ s. Default is None. - verbose (bool, optional): Log extra information (such as examples of processed data points). Default is False. + verbose (bool, optional): Log extra information (such as examples of processed data + points). Default is False. """ def __init__(self, name=None, labels=None, examples=None, verbose=False): @@ -416,7 +422,8 @@ def get_input_ids( if max_length > 1_000_000: logger.warning( - "Tokenizer maximum length is greater than 1,000,000. This is likely a mistake. Resetting to 512 tokens." + "Tokenizer maximum length is greater than 1,000,000. This is likely a mistake. " + + "Resetting to 512 tokens." ) max_length = 512 @@ -431,7 +438,8 @@ def get_input_ids( ] if not len(src_txt) < 2: # if there is NOT 1 sentence - # separate each sentence with ' [SEP] [CLS] ' (or model equivalent tokens) and convert to string + # separate each sentence with ' [SEP] [CLS] ' (or model equivalent tokens) and + # convert to string separation_string = " " + sep_token + " " + cls_token + " " text = separation_string.join(src_txt) else: @@ -473,18 +481,24 @@ def add_examples( Arguments: texts (list): A list of documents where each document is a list of sentences where each - sentence is a list of tokens. This is the output of `convert_to_extractive.py` - and is in the 'src' field for each doc. See :meth:`extractive.ExtractiveSummarizer.prepare_data`. - labels (list, optional): A list of the labels for each document where each label is a list of labels - where the index of the label coresponds with the index of the sentence in the - respective entry in `texts.` Similarly to `texts`, this is handled automatically - by `ExtractiveSummarizer.prepare_data`. Default is None. - ids (list, optional): A list of ids for each document. Not used by `ExtractiveSummarizer`. Default is None. - oracle_ids (list, optional): Similar to labels but is a list of indexes of the chosen sentences - instead of a one-hot encoded vector. These will be converted to labels. Default is None. - targets (list, optional): A list of the abstractive target for each document. Default is None. - overwrite_labels (bool, optional): Replace any labels currently stored by the ``SentencesProcessor``. Default is False. - overwrite_examples (bool, optional): Replace any examples currently stored by the ``SentencesProcessor``. Default is False. + sentence is a list of tokens. This is the output of `convert_to_extractive.py` + and is in the 'src' field for each doc. See + :meth:`extractive.ExtractiveSummarizer.prepare_data`. + labels (list, optional): A list of the labels for each document where each label is a + list of labels where the index of the label coresponds with the index of the + sentence in the respective entry in `texts.` Similarly to `texts`, this is handled + automatically by `ExtractiveSummarizer.prepare_data`. Default is None. + ids (list, optional): A list of ids for each document. Not used by + `ExtractiveSummarizer`. Default is None. + oracle_ids (list, optional): Similar to labels but is a list of indexes of the chosen + sentences instead of a one-hot encoded vector. 
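For the sentence-separation step in `get_input_ids` described above, a minimal sketch (sentences and tokens are made up, and the single-sentence fallback is assumed) of joining a multi-sentence document with the model's separator and classifier tokens before tokenization.

sep_token, cls_token = "[SEP]", "[CLS]"  # or the chosen model's equivalent tokens
src_txt = ["the first sentence .", "the second sentence .", "the third sentence ."]

if not len(src_txt) < 2:  # if there is NOT 1 sentence
    separation_string = " " + sep_token + " " + cls_token + " "
    text = separation_string.join(src_txt)
else:
    text = src_txt[0]  # assumed fallback for this sketch

print(text)
# the first sentence . [SEP] [CLS] the second sentence . [SEP] [CLS] the third sentence .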
These will be converted to labels. + Default is None. + targets (list, optional): A list of the abstractive target for each document. + Default is None. + overwrite_labels (bool, optional): Replace any labels currently stored by the + ``SentencesProcessor``. Default is False. + overwrite_examples (bool, optional): Replace any examples currently stored by the + ``SentencesProcessor``. Default is False. Returns: list: The examples as ``InputExample``\ s that have been added. @@ -502,8 +516,8 @@ def add_examples( labels = [] for text_set, oracle_id in zip(texts, oracle_ids): text_label = [0] * len(text_set) - for l in oracle_id: - text_label[l] = 1 + for l_id in oracle_id: + text_label[l_id] = 1 labels.append(text_label) else: labels = [None] * len(texts) @@ -558,9 +572,10 @@ def get_features_process( pad_ids_and_attention=True, ): """ - The process that actually creates the features. :meth:`~data.SentencesProcessor.get_features` - is the driving function, look there for a description of how this function works. This - function only exists so that processing can easily be done in parallel using ``Pool.map``. + The process that actually creates the features. + :meth:`~data.SentencesProcessor.get_features` is the driving function, look there for a + description of how this function works. This function only exists so that processing can + easily be done in parallel using ``Pool.map``. """ ex_index, example, label = input_information if ex_index % 1000 == 0: @@ -721,28 +736,30 @@ def get_features( save_to_name=None, save_as_type="txt", ): - r"""Convert the examples stored by the ``SentencesProcessor`` to features that can be used by - a model. The following processes can be performed: tokenization, token type ids (to separate - sentences), sentence representation token ids (the locations of each sentence representation - token), sentence lengths, and the attention mask. Padding can be applied to the tokenized - examples and the attention masks but it is recommended to instead use the + r"""Convert the examples stored by the ``SentencesProcessor`` to features that can be used + by a model. The following processes can be performed: tokenization, token type ids (to + separate sentences), sentence representation token ids (the locations of each sentence + representation token), sentence lengths, and the attention mask. Padding can be applied to + the tokenized examples and the attention masks but it is recommended to instead use the :meth:`data.pad_batch_collate` function so each batch is padded individually for efficiency (less zeros passed through model). Arguments: - tokenizer (transformers.PreTrainedTokenizer): The tokenizer used to tokenize the examples. - bert_compatible_cls (bool, optional): Adds '[CLS]' tokens in front of each sentence. This is useful - so that the '[CLS]' token can be used to obtain sentence + tokenizer (transformers.PreTrainedTokenizer): The tokenizer used to tokenize the + examples. + bert_compatible_cls (bool, optional): Adds '[CLS]' tokens in front of each sentence. + This is useful so that the '[CLS]' token can be used to obtain sentence embeddings. This only works if the chosen model has the '[CLS]' token in its vocabulary. Default is True. - create_sent_rep_token_ids (bool, optional): Option to create sentence representation token ids. This - will store a list of the indexes of all the ``sent_rep_token_id``\ s - in the tokenized example. Default is True. 
- sent_rep_token_id ([type], optional): The token id that should be captured for each sentence (should have - one per sentence and each should represent that sentence). + create_sent_rep_token_ids (bool, optional): Option to create sentence representation + token ids. This will store a list of the indexes of all the + ``sent_rep_token_id``\ s in the tokenized example. Default is True. + sent_rep_token_id ([type], optional): The token id that should be captured for each + sentence (should have one per sentence and each should represent that sentence). Default is ``'[CLS]' token if bert_compatible_cls else '[SEP]' token``. - create_sent_lengths (bool, optional): Option to create a list of sentence lengths where each index in - the list coresponds to the respective sentence in the example. Default is True. + create_sent_lengths (bool, optional): Option to create a list of sentence lengths where + each index in the list corresponds to the respective sentence in the example. + Default is True. create_segment_ids (str, optional): Option to create segment ids (aka token type ids). See https://huggingface.co/transformers/glossary.html#token-type-ids for more info. Set to either "binary", "sequential", or False. @@ -758,38 +775,47 @@ def get_features( "binary" mode exists so that these pretrained models can easily be used. Default is "binary". - segment_token_id (str, optional): The token id to be used when creating segment ids. Can be set to 'period' - to treat periods as sentence separation tokens, but this is a terrible - idea for obvious reasons. Default is '[SEP]' token id. - create_source (bool, optional): Option to save the source text (non-tokenized) as a string. Default is False. - n_process (int, optional): How many processes to use for multithreading for running get_features_process(). - Set higher to run faster and set lower is you experience OOM issues. Default is 2. - max_length (int, optional): If ``pad_ids_and_attention`` is True then pad to this amount. Default is ``tokenizer.max_len``. - pad_on_left (bool, optional): Optionally, pad on the left instead of right. Default is False. - pad_token (int, optional): Which token to use for padding the ``input_ids``. Default is 0. - mask_padding_with_zero (bool, optional): Use zeros to pad the attention. Uses ones otherwise. Default is True. - create_attention_mask (bool, optional): Option to create the attention mask. It is recommended to use - the :meth:`data.pad_batch_collate` function, which will automatically create - attention masks and pad them on a per batch level. Default is ``False if return_type == "lists" else True``. - pad_ids_and_attention (bool, optional): Pad the ``input_ids`` with ``pad_token`` and attention masks - with 0s or 1s deneding on ``mask_padding_with_zero``. Pad both to + segment_token_id (str, optional): The token id to be used when creating segment ids. + Can be set to 'period' to treat periods as sentence separation tokens, but this + is a terrible idea for obvious reasons. Default is '[SEP]' token id. + create_source (bool, optional): Option to save the source text (non-tokenized) as a + string. Default is False. + n_process (int, optional): How many processes to use for multithreading for running + get_features_process(). Set higher to run faster and set lower if you experience + OOM issues. Default is 2. + max_length (int, optional): If ``pad_ids_and_attention`` is True then pad to this + amount. Default is ``tokenizer.max_len``. + pad_on_left (bool, optional): Optionally, pad on the left instead of right.
Default + is False. + pad_token (int, optional): Which token to use for padding the ``input_ids``. Default + is 0. + mask_padding_with_zero (bool, optional): Use zeros to pad the attention. Uses ones + otherwise. Default is True. + create_attention_mask (bool, optional): Option to create the attention mask. It is + recommended to use the :meth:`data.pad_batch_collate` function, which will + automatically create attention masks and pad them on a per batch level. Default + is ``False if return_type == "lists" else True``. + pad_ids_and_attention (bool, optional): Pad the ``input_ids`` with ``pad_token`` and + attention masks with 0s or 1s deneding on ``mask_padding_with_zero``. Pad both to ``max_length``. Default is ``False if return_type == "lists" else True`` - return_type (str, optional): Either "tensors", "lists", or None. See "Returns" section below. Default is None. - save_to_path (str, optional): The folder/directory to save the data to OR None to not save. - Will save the data specified by ``return_type`` to disk. Default is None. - save_to_name (str, optional): The name of the file to save. The extension '.pt' is automatically - appended. Default is ``'dataset_' + self.name + '.pt'``. - save_as_type (str, optional): The file extension of saved file if `save_to_path` is set. Supports "pt" (PyTorch) - and "txt" (Text). Saving as "txt" requires the ``return_type`` to be ``lists``. If ``return_type`` is - ``tensors`` the only ``save_as_type`` available is "pt". Defaults to "txt". + return_type (str, optional): Either "tensors", "lists", or None. See "Returns" section + below. Default is None. + save_to_path (str, optional): The folder/directory to save the data to OR None to not + save. Will save the data specified by ``return_type`` to disk. Default is None. + save_to_name (str, optional): The name of the file to save. The extension '.pt' is + automatically appended. Default is ``'dataset_' + self.name + '.pt'``. + save_as_type (str, optional): The file extension of saved file if `save_to_path` is + set. Supports "pt" (PyTorch) and "txt" (Text). Saving as "txt" requires the + ``return_type`` to be ``lists``. If ``return_type`` is ``tensors`` the only + ``save_as_type`` available is "pt". Defaults to "txt". Returns: list or torch.TensorDataset: If ``return_type is None`` return the list of calculated features. If ``return_type == "tensors"`` return the features converted to tensors and stacked such that features are grouped together into individual tensors. If ``return_type == "lists"``, which is the recommended option then exports each - ``InputFeatures`` object in the exported ``features`` list as a dictionary and appends each - dictionary to a list. Returns that list. + ``InputFeatures`` object in the exported ``features`` list as a dictionary and appends + each dictionary to a list. Returns that list. 
""" assert return_type in ["tensors", "lists"] or return_type is None assert save_as_type in ["txt", "pt"] or save_to_path is None diff --git a/src/extractive.py b/src/extractive.py index 98267c9..d1a3228 100644 --- a/src/extractive.py +++ b/src/extractive.py @@ -1,51 +1,42 @@ -import os -import sys import glob import logging +import os +import statistics +import sys import types +from argparse import ArgumentParser, Namespace +from collections import OrderedDict +from functools import partial from typing import List, Union -import statistics + import numpy as np -from functools import partial -from collections import OrderedDict -from argparse import ArgumentParser, Namespace import pytorch_lightning as pl -from rouge_score import rouge_scorer, scoring import torch +from rouge_score import rouge_scorer, scoring +from spacy.lang.en import English from torch import nn from torch.utils.data import DataLoader -from spacy.lang.en import English -from pooling import Pooling -from data import SentencesProcessor, FSIterableDataset, pad_batch_collate, FSDataset +from transformers import AutoConfig, AutoModel, AutoTokenizer +from transformers.data.metrics import acc_and_f1 + from classifier import ( LinearClassifier, SimpleLinearClassifier, TransformerEncoderClassifier, ) -from helpers import ( - load_json, - block_trigrams, - test_rouge, - generic_configure_optimizers, -) +from data import FSDataset, FSIterableDataset, SentencesProcessor, pad_batch_collate +from helpers import block_trigrams, generic_configure_optimizers, load_json, test_rouge +from pooling import Pooling logger = logging.getLogger(__name__) -from transformers import ( - AutoConfig, - AutoModel, - AutoTokenizer, -) -from transformers.data.metrics import acc_and_f1 # CUSTOM_MODEL_CLASSES = ("longformer",) try: from transformers.models.auto.modeling_auto import MODEL_MAPPING - MODEL_CLASSES = tuple( - [m.model_type for m in MODEL_MAPPING] - ) # + CUSTOM_MODEL_CLASSES + MODEL_CLASSES = tuple(m.model_type for m in MODEL_MAPPING) # + CUSTOM_MODEL_CLASSES except ImportError: logger.warning( "Could not import `MODEL_MAPPING` from transformers because it is an old version." @@ -118,7 +109,10 @@ def __init__(self, hparams, embedding_model_config=None, classifier_obj=None): self.word_embedding_model = AutoModel.from_config(embedding_model_config) if ( - any(x in hparams.model_name_or_path for x in ["roberta", "distil", "longformer"]) + any( + x in hparams.model_name_or_path + for x in ["roberta", "distil", "longformer"] + ) ) and not hparams.no_use_token_type_ids: logger.warning( ( @@ -148,7 +142,8 @@ def __init__(self, hparams, embedding_model_config=None, classifier_obj=None): sent_rep_tokens=False, mean_tokens=True, max_tokens=False ) - # if a classifier object was passed when creating this model then store that as the `encoder` + # if a classifier object was passed when creating this model then store that as the + # `encoder` if classifier_obj: self.encoder = classifier_obj # otherwise create the classifier using the `hparams.classifier` parameter if available @@ -266,7 +261,7 @@ def forward( tuple: Contains the sentence scores and mask as ``torch.Tensor``\ s. The mask is either the ``sent_rep_mask`` or ``sent_lengths_mask`` depending on the pooling mode used during model initialization. - """ + """ # noqa: E501 inputs = { "input_ids": input_ids, "attention_mask": attention_mask, @@ -305,17 +300,18 @@ def compute_loss(self, outputs, labels, mask): """Compute the loss between model outputs and ground-truth labels. 
Args: - outputs (torch.Tensor): Output sentence scores obtained from :meth:`~extractive.ExtractiveSummarizer.forward` + outputs (torch.Tensor): Output sentence scores obtained from + :meth:`~extractive.ExtractiveSummarizer.forward` labels (torch.Tensor): Ground-truth labels (``1`` for sentences that should be in the summary, ``0`` otherwise) from the batch generated during the data preprocessing stage. mask (torch.Tensor): Mask returned by :meth:`~extractive.ExtractiveSummarizer.forward`, - either ``sent_rep_mask`` or ``sent_lengths_mask`` depending on the pooling mode used - during model initialization. + either ``sent_rep_mask`` or ``sent_lengths_mask`` depending on the pooling mode + used during model initialization. Returns: - [tuple]: Losses: (total_loss, total_norm_batch_loss, sum_avg_seq_loss, mean_avg_seq_loss, - average_loss) + [tuple]: Losses: (total_loss, total_norm_batch_loss, sum_avg_seq_loss, + mean_avg_seq_loss, average_loss) """ try: loss = self.loss_func(outputs, labels.float()) @@ -376,7 +372,12 @@ def setup(self, stage): ) def json_to_dataset( - self, tokenizer, hparams, inputs=None, num_files=0, processor=None, + self, + tokenizer, + hparams, + inputs=None, + num_files=0, + processor=None, ): """Convert json output from ``convert_to_extractive.py`` to a ".pt" file containing lists or tensors using a :class:`data.SentencesProcessor`. This function is run by @@ -450,8 +451,11 @@ def prepare_data(self): Algorithm: For each json file outputted by the ``convert_to_extractive.py`` script: 1. Load json file. - 2. Add each document in json file to ``SentencesProcessor`` defined in ``self.processor``, overwriting any previous data in the processor. - 3. Run :meth:`data.SentencesProcessor.get_features` to save the extracted features to disk as a ``.pt`` file containing a pickled python list of dictionaries, which each dictionary contains the extracted features. + 2. Add each document in json file to ``SentencesProcessor`` defined in ``self.processor``, + overwriting any previous data in the processor. + 3. Run :meth:`data.SentencesProcessor.get_features` to save the extracted features to disk + as a ``.pt`` file containing a pickled python list of dictionaries, which each + dictionary contains the extracted features. Memory Usage Note: If sharding was turned off during the ``convert_to_extractive`` process then this function will run once, loading the entire dataset into memory to process @@ -466,7 +470,8 @@ def get_inferred_data_type(dataset_files): not dataset_files_extensions_equal ) and self.hparams.data_type == "none": logger.error( - "Cannot infer data file type because files with different extensions detected. Please set `--data_type`." + "Cannot infer data file type because files with different extensions " + + "detected. Please set `--data_type`." ) sys.exit(1) @@ -480,7 +485,9 @@ def get_inferred_data_type(dataset_files): and self.hparams.data_type != "none" ): logger.warning( - "`--data_type` is '%s', but the most common file type detected in the `--data_path` is '%s'. Using '%s' as the type. Data will be processed if this type does not exist. Did you choose the correct data type?", + "`--data_type` is '%s', but the most common file type detected in the " + + "`--data_path` is '%s'. Using '%s' as the type. Data will be processed " + + "if this type does not exist. 
Did you choose the correct data type?", self.hparams.data_type, most_common, self.hparams.data_type, @@ -488,7 +495,8 @@ def get_inferred_data_type(dataset_files): if len(dataset_files) == 0 and self.hparams.data_type == "none": logger.error( - "Data is going to be processed, but you have not specified an output format. Set `--data_type` to the desired format." + "Data is going to be processed, but you have not specified an output format. " + + "Set `--data_type` to the desired format." ) sys.exit(1) @@ -535,7 +543,8 @@ def get_inferred_data_type(dataset_files): ) if len(json_files) == 0: logger.error( - "No JSON dataset files detected for %s split. Make sure the `--data_path` is correct.", + "No JSON dataset files detected for %s split. Make sure the `--data_path`" + + " is correct.", corpus_type, ) sys.exit(1) @@ -572,7 +581,8 @@ def remove_complete(doc): ) for _ in map( - json_to_dataset_processor, zip(range(len(json_files)), json_files), + json_to_dataset_processor, + zip(range(len(json_files)), json_files), ): pass # pool.close() @@ -587,20 +597,22 @@ def remove_complete(doc): ) ) - # if set to only preprocess the data then continue to next loop (aka next split of dataset) + # if set to only preprocess the data then continue to next loop + # (aka next split of dataset) if self.hparams.only_preprocess: continue # always create actual dataset, either after writing the shard files to disk - # or by skipping that step (because preprocessed files detected) and going right to loading. + # or by skipping that step (because preprocessed files detected) and going right to + # loading. if self.hparams.dataloader_type == "map": if inferred_data_type != "txt": logger.error( - """The `--dataloader_type` is 'map' but the `--data_type` was not - inferred to be 'txt'. The map-style dataloader requires 'txt' data. - Either set `--dataloader_type` to 'iterable' to use the old data + """The `--dataloader_type` is 'map' but the `--data_type` was not + inferred to be 'txt'. The map-style dataloader requires 'txt' data. + Either set `--dataloader_type` to 'iterable' to use the old data format or process the JSON to TXT by setting `--data_type` to - 'txt'. Alternatively, you can convert directly from PT to TXT + 'txt'. Alternatively, you can convert directly from PT to TXT using `scripts/convert_extractive_pt_to_txt.py`.""" ) sys.exit(1) @@ -616,7 +628,8 @@ def remove_complete(doc): # if set to only preprocess the data then exit after all loops have been completed if self.hparams.only_preprocess: logger.warning( - "Exiting since data has been preprocessed and written to disk and `hparams.only_preprocess` is True." + "Exiting since data has been preprocessed and written to disk " + + "and `hparams.only_preprocess` is True." ) sys.exit(0) @@ -694,7 +707,7 @@ def configure_optimizers(self): ) def training_step(self, batch, batch_idx): # skipcq: PYL-W0613 - """Training step: `PyTorch Lightning Documentation `__""" + """Training step: `PyTorch Lightning Documentation `__""" # noqa: E501 # Get batch information labels = batch["labels"] @@ -741,11 +754,11 @@ def training_step(self, batch, batch_idx): # skipcq: PYL-W0613 def validation_step(self, batch, batch_idx): # skipcq: PYL-W0613 """ Validation step: `PyTorch Lightning Documentation `__ - Similar to :meth:`~extractive.ExtractiveSummarizer.training_step` in that in runs the inputs - through the model. 
However, this method also calculates accuracy and f1 score by marking every - sentence score >0.5 as 1 (meaning should be in the summary) and each score <0.5 as 0 (meaning - should not be in the summary). - """ + Similar to :meth:`~extractive.ExtractiveSummarizer.training_step` in that in runs the + inputs through the model. However, this method also calculates accuracy and f1 score by + marking every sentence score >0.5 as 1 (meaning should be in the summary) and each score + <0.5 as 0 (meaning should not be in the summary). + """ # noqa: E501 # Get batch information labels = batch["labels"] @@ -795,7 +808,7 @@ def validation_epoch_end(self, outputs): """ Called at the end of a validation epoch: `PyTorch Lightning Documentation `__ Finds the mean of all the metrics logged by :meth:`~extractive.ExtractiveSummarizer.validation_step`. - """ + """ # noqa: E501 # Get the average loss and accuracy metrics over all evaluation runs avg_loss_total = torch.stack([x["val_loss_total"] for x in outputs]).mean() avg_loss_total_norm_batch = torch.stack( @@ -832,16 +845,17 @@ def validation_epoch_end(self, outputs): def test_step(self, batch, batch_idx): """ Test step: `PyTorch Lightning Documentation `__ - Similar to :meth:`~extractive.ExtractiveSummarizer.validation_step` in that in runs the inputs - through the model. However, this method also calculates the ROUGE scores for each example-summary - pair. - """ + Similar to :meth:`~extractive.ExtractiveSummarizer.validation_step` in that in runs the + inputs through the model. However, this method also calculates the ROUGE scores for each + example-summary pair. + """ # noqa: E501 # Get batch information labels = batch["labels"] sources = batch["source"] targets = batch["target"] - # delete labels, sources, and targets so now batch contains everything to be inputted into the model + # delete labels, sources, and targets so now batch contains everything to be inputted into + # the model del batch["labels"] del batch["source"] del batch["target"] @@ -922,7 +936,9 @@ def test_step(self, batch, batch_idx): for sent_idx, i in enumerate(source_ids): if i >= len(source): logger.debug( - "Only %i examples selected from document %i in batch %i. This is likely because some sentences received ranks so small they rounded to zero and a padding 'sentence' was randomly chosen.", + "Only %i examples selected from document %i in batch %i. This is likely " + + "because some sentences received ranks so small they rounded to zero " + + "and a padding 'sentence' was randomly chosen.", sent_idx + 1, idx, batch_idx, @@ -930,11 +946,11 @@ def test_step(self, batch, batch_idx): continue candidate = source[i].strip() - # If trigram blocking is enabled and searching for matching trigrams finds no matches - # then add the candidate to the current prediction list. - # During the predicting process, Trigram Blocking is used to reduce redundancy. Given - # selected summary S and a candidate sentence c, we will skip c is there exists a - # trigram overlapping between c and S. + # If trigram blocking is enabled and searching for matching trigrams finds no + # matches then add the candidate to the current prediction list. + # During the predicting process, Trigram Blocking is used to reduce redundancy. + # Given selected summary S and a candidate sentence c, we will skip c is there + # exists a trigram overlapping between c and S. 
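To make the trigram-blocking rule above concrete, here is a hand-rolled sketch (an illustration of the idea, not the repo's `block_trigrams` helper): a candidate sentence is skipped whenever any of its trigrams already appears in the summary selected so far.

def _trigrams(text):
    tokens = text.split()
    return {tuple(tokens[i:i + 3]) for i in range(len(tokens) - 2)}

def has_repeated_trigram(candidate, prediction):
    selected_trigrams = set()
    for sentence in prediction:
        selected_trigrams |= _trigrams(sentence)
    return bool(_trigrams(candidate) & selected_trigrams)

current_prediction = ["the cat sat on the mat"]
print(has_repeated_trigram("the cat sat by the door", current_prediction))  # True, skip it
print(has_repeated_trigram("a dog ran in the park", current_prediction))    # False, keep it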
if (not self.hparams.no_test_block_trigrams) and ( not block_trigrams(candidate, current_prediction) ): @@ -950,7 +966,7 @@ def test_step(self, batch, batch_idx): # See this issue https://github.com/google-research/google-research/issues/168 # for info about the differences between `pyrouge` and `rouge-score`. - # Archive Link: https://web.archive.org/web/20200622205503/https://github.com/google-research/google-research/issues/168 + # Archive Link: https://web.archive.org/web/20200622205503/https://github.com/google-research/google-research/issues/168 # noqa: E501 if self.hparams.test_use_pyrouge: # Convert `current_prediction` from list to string with a "" between each # item/sentence. In ROUGE 1.5.5 (`pyrouge`), a "\n" indicates sentence @@ -992,7 +1008,7 @@ def test_epoch_end(self, outputs): """ Called at the end of a testing epoch: `PyTorch Lightning Documentation `__ Finds the mean of all the metrics logged by :meth:`~extractive.ExtractiveSummarizer.test_step`. - """ + """ # noqa: E501 # Get the accuracy metrics over all testing runs avg_test_acc = torch.stack([x["test_acc"] for x in outputs]).mean() avg_test_f1 = torch.stack([x["test_f1"] for x in outputs]).mean() @@ -1024,7 +1040,7 @@ def test_epoch_end(self, outputs): # and values that are `AggregateScore` objects. Each `AggregateScore` object is a # named tuple with a low, mid, and high value. Each value is a `Score` object, which # is also a named tuple, that contains the precision, recall, and fmeasure values. - # For more info see the source code: https://github.com/google-research/google-research/blob/master/rouge/scoring.py + # For more info see the source code: https://github.com/google-research/google-research/blob/master/rouge/scoring.py # noqa: E501 rouge_result = aggregator.aggregate() for metric, value in rouge_result.items(): @@ -1176,7 +1192,9 @@ def add_model_specific_args(parent_parser): "--model_name_or_path", type=str, default="bert-base-uncased", - help="Path to pre-trained model or shortcut name. A list of shortcut names can be found at https://huggingface.co/transformers/pretrained_models.html. Community-uploaded models are located at https://huggingface.co/models.", + help="Path to pre-trained model or shortcut name. A list of shortcut names can be " + + "found at https://huggingface.co/transformers/pretrained_models.html. " + + "Community-uploaded models are located at https://huggingface.co/models.", ) parser.add_argument( "--model_type", @@ -1188,7 +1206,8 @@ def add_model_specific_args(parent_parser): parser.add_argument( "--tokenizer_no_use_fast", action="store_true", - help="Don't use the fast version of the tokenizer for the specified model. More info: https://huggingface.co/transformers/main_classes/tokenizer.html.", + help="Don't use the fast version of the tokenizer for the specified model. " + + "More info: https://huggingface.co/transformers/main_classes/tokenizer.html.", ) parser.add_argument( "--max_seq_length", @@ -1204,11 +1223,11 @@ def add_model_specific_args(parent_parser): default="none", type=str, choices=["txt", "pt", "none"], - help="""The file extension of the prepared data. The 'map' `--dataloader_type` - requires `txt` and the 'iterable' `--dataloader_type` works with both. If the data - is not prepared yet (in JSON format) this value specifies the output format - after processing. If the data is prepared, this value specifies the format to load. - If it is `none` then the type of data to be loaded will be inferred from the + help="""The file extension of the prepared data. 
The 'map' `--dataloader_type` + requires `txt` and the 'iterable' `--dataloader_type` works with both. If the data + is not prepared yet (in JSON format) this value specifies the output format + after processing. If the data is prepared, this value specifies the format to load. + If it is `none` then the type of data to be loaded will be inferred from the `data_path`. If data needs to be prepared, this cannot be set to `none`.""", ) parser.add_argument("--num_threads", type=int, default=4) @@ -1243,10 +1262,10 @@ def add_model_specific_args(parent_parser): "--dataloader_num_workers", default=4, type=int, - help="""The number of workers to use when loading data. A general place to - start is to set num_workers equal to the number of CPU cores on your machine. - If `--dataloader_type` is 'iterable' then this setting has no effect and - num_workers will be 1. More details here: https://pytorch-lightning.readthedocs.io/en/latest/performance.html#num-workers""", + help="""The number of workers to use when loading data. A general place to + start is to set num_workers equal to the number of CPU cores on your machine. + If `--dataloader_type` is 'iterable' then this setting has no effect and + num_workers will be 1. More details here: https://pytorch-lightning.readthedocs.io/en/latest/performance.html#num-workers""", # noqa: E501 ) parser.add_argument( "--processor_no_bert_compatible_cls", @@ -1263,7 +1282,9 @@ def add_model_specific_args(parent_parser): parser.add_argument( "--preprocess_resume", action="store_true", - help='Resume preprocessing. `--only_preprocess` must be set in order to resume. Determines which files to process by finding the shards that do not have a coresponding ".pt" file in the data directory.', + help="Resume preprocessing. `--only_preprocess` must be set in order to resume. " + + "Determines which files to process by finding the shards that do not have a " + + 'coresponding ".pt" file in the data directory.', ) parser.add_argument( "--create_token_type_ids", @@ -1289,7 +1310,7 @@ def add_model_specific_args(parent_parser): `nn.TransformerEncoderLayer`s and then a simple `nn.Linear` layer. `transformer_linear` - a `TransformerEncoderClassifier` with a `LinearClassifier` as the `reduction` parameter, which results in the same thing as the `transformer` option but with a - `LinearClassifier` instead of a `nn.Linear` layer.""", + `LinearClassifier` instead of a `nn.Linear` layer.""", # noqa: E501 ) parser.add_argument( "--classifier_dropout", @@ -1301,7 +1322,8 @@ def add_model_specific_args(parent_parser): "--classifier_transformer_num_layers", type=int, default=2, - help='The number of layers for the `transformer` classifier. Only has an effect if `--classifier` contains "transformer".', + help="The number of layers for the `transformer` classifier. Only has an effect if " + + '`--classifier` contains "transformer".', ) parser.add_argument( "--train_name", @@ -1332,12 +1354,14 @@ def add_model_specific_args(parent_parser): "--test_k", type=float, default=3, - help="The `k` parameter for the `--test_id_method`. Must be set if using the `greater_k` option. (default: 3)", + help="The `k` parameter for the `--test_id_method`. Must be set if using the " + + "`greater_k` option. (default: 3)", ) parser.add_argument( "--no_test_block_trigrams", action="store_true", - help="Disable trigram blocking when calculating ROUGE scores during testing. This will increase repetition and thus decrease accuracy.", + help="Disable trigram blocking when calculating ROUGE scores during testing. 
" + + "This will increase repetition and thus decrease accuracy.", ) parser.add_argument( "--test_use_pyrouge", @@ -1347,7 +1371,7 @@ def add_model_specific_args(parent_parser): package installed. More details about ROUGE 1.5.5 here: https://github.com/andersjo/pyrouge/tree/master/tools/ROUGE-1.5.5. It is recommended to use this option for official scores. The `ROUGE-L` measurements from `pyrouge` are equivalent to the `rougeLsum` measurements from the default - `rouge-score` package.""", + `rouge-score` package.""", # noqa: E501 ) parser.add_argument( "--loss_key", @@ -1360,6 +1384,8 @@ def add_model_specific_args(parent_parser): "loss_avg", ], default="loss_avg_seq_mean", - help="Which reduction method to use with BCELoss. See the `experiments/loss_functions/` folder for info on how the default (`loss_avg_seq_mean`) was chosen.", + help="Which reduction method to use with BCELoss. See the " + + "`experiments/loss_functions/` folder for info on how the default " + + "(`loss_avg_seq_mean`) was chosen.", ) return parser diff --git a/src/helpers.py b/src/helpers.py index fa1edcc..6243854 100644 --- a/src/helpers.py +++ b/src/helpers.py @@ -1,17 +1,18 @@ -import os -import math -import time -import shutil -import json import gzip +import json import logging +import math +import os +import shutil +import time +from functools import partial + +import numpy as np import pytorch_lightning as pl import torch +import torch.nn.functional as F import torch_optimizer -import numpy as np from torch import nn -import torch.nn.functional as F -from functools import partial from torch.utils.data import Sampler logger = logging.getLogger(__name__) @@ -93,7 +94,7 @@ def lr_lambda_func(current_step, num_warmup_steps, num_training_steps): def block_trigrams(candidate, prediction): - """Decrease repetition in summaries by checking if a trigram from ``prediction`` + """Decrease repetition in summaries by checking if a trigram from ``prediction`` exists in ``candidate`` Args: @@ -142,9 +143,12 @@ def _get_word_ngrams(n, sentences): def pad(data, pad_id, width=None, pad_on_left=False, nearest_multiple_of=False): - """Pad ``data`` with ``pad_id`` to ``width`` on the right by default but if ``pad_on_left`` then left.""" + """ + Pad ``data`` with ``pad_id`` to ``width`` on the right by default but if + ``pad_on_left`` then left. + """ if not width: - width = max([len(d) for d in data]) + width = max(len(d) for d in data) if nearest_multiple_of: width = math.ceil(width / nearest_multiple_of) * nearest_multiple_of if pad_on_left: @@ -157,9 +161,12 @@ def pad(data, pad_id, width=None, pad_on_left=False, nearest_multiple_of=False): def pad_tensors( tensors, pad_id=0, width=None, pad_on_left=False, nearest_multiple_of=False ): - """Pad ``tensors`` with ``pad_id`` to ``width`` on the right by default but if ``pad_on_left`` then left.""" + """ + Pad ``tensors`` with ``pad_id`` to ``width`` on the right by default but + if ``pad_on_left`` then left. + """ if not width: - width = max([len(d) for d in tensors]) + width = max(len(d) for d in tensors) if nearest_multiple_of: width = math.ceil(width / nearest_multiple_of) * nearest_multiple_of if pad_on_left: @@ -170,25 +177,28 @@ def pad_tensors( value=pad_id, ) return F.pad( - tensors, pad=(0, (width - tensors.size()[-1])), mode="constant", value=pad_id, + tensors, + pad=(0, (width - tensors.size()[-1])), + mode="constant", + value=pad_id, ) def test_rouge(temp_dir, cand, ref): r"""Compute ROUGE scores using the official ROUGE 1.5.5 package. 
This function uses the - ``pyrouge`` python module to interface with the office ROUGE script. There should be a - "" token between each sentence in the ``cand`` and ``ref`` files. ``pyrouge`` splits - sentences based on newlines but we cannot store all the summaries easily in a single text - file if there is a newline between each sentence since newlines mark new summaries. Thus, + ``pyrouge`` python module to interface with the office ROUGE script. There should be a + "" token between each sentence in the ``cand`` and ``ref`` files. ``pyrouge`` splits + sentences based on newlines but we cannot store all the summaries easily in a single text + file if there is a newline between each sentence since newlines mark new summaries. Thus, the "" token is used in the text files and is converted to a newline in this function. Using "" instead of ``\\n`` also makes it easier to store the ground-truth summaries in the ``convert_to_extractive.py`` script. Args: temp_dir (str): A temporary folder to store files for input to the ROUGE script. - cand (str): The path to the file containing one candidate summary per line with + cand (str): The path to the file containing one candidate summary per line with "" tokens in between each sentence. - ref (str): The path to the file containing one ground-truth/gold summary per line + ref (str): The path to the file containing one ground-truth/gold summary per line with "" tokens in between each sentence. Returns: @@ -244,7 +254,7 @@ class LabelSmoothingLoss(nn.Module): KL-divergence between q_{smoothed ground truth prob.}(w) and p_{prob. computed by model}(w) is minimized. From OpenNMT with modifications: https://github.com/OpenNMT/OpenNMT-py/blob/e8622eb5c6117269bb3accd8eb6f66282b5e67d9/onmt/utils/loss.py#L186 - """ + """ # noqa: E501 def __init__(self, label_smoothing, tgt_vocab_size, ignore_index=-100): assert 0.0 < label_smoothing <= 1.0 @@ -275,7 +285,7 @@ def forward(self, output, target): # https://github.com/huggingface/transformers/blob/dc31a72f505bc115a2214a68c8ea7c956f98fd1b/examples/seq2seq/utils.py#L158 class SortishSampler(Sampler): """ - Go through the text data by order of src length with a bit of randomness. + Go through the text data by order of src length with a bit of randomness. From fastai repo with modifications. """ @@ -369,7 +379,7 @@ def generic_configure_optimizers(hparams, train_dataloader, params_to_update): # the global_step every gradient accumulation cycle. Therefore, the # scheduler needs to have `accumulate_grad_batches` * `max_steps` in # order to reach `max_steps`. - # See: https://github.com/PyTorchLightning/pytorch-lightning/blob/f293c9b5f4b4f9fabb2eec0c369f08a66c57ef14/pytorch_lightning/trainer/training_loop.py#L624 + # See: https://github.com/PyTorchLightning/pytorch-lightning/blob/f293c9b5f4b4f9fabb2eec0c369f08a66c57ef14/pytorch_lightning/trainer/training_loop.py#L624 # noqa: E501 t_total = hparams.max_steps * hparams.accumulate_grad_batches else: t_total = int( @@ -404,10 +414,10 @@ def generic_configure_optimizers(hparams, train_dataloader, params_to_update): if hparams.use_scheduler: if hparams.use_scheduler == "linear": # We have to import the function and create a partial because functions cannot be - # serialized by python pickle. Therefore, if the normal `get_linear_schedule_with_warmup` - # function provided by `transformers` was used, the program would fail to save - # `hparams` because the optimizer would contain a locale function that cannot be - # pickled. + # serialized by python pickle. 
Therefore, if the normal + # `get_linear_schedule_with_warmup` function provided by `transformers` was used, + # the program would fail to save `hparams` because the optimizer would contain a + # locale function that cannot be pickled. lr_lambda = partial( lr_lambda_func, num_warmup_steps=hparams.warmup_steps * hparams.accumulate_grad_batches, @@ -434,7 +444,8 @@ def generic_configure_optimizers(hparams, train_dataloader, params_to_update): ) else: logger.error( - "The value %s for `--use_scheduler` is invalid.", hparams.use_scheduler, + "The value %s for `--use_scheduler` is invalid.", + hparams.use_scheduler, ) # the below interval is called "step" but the scheduler is moved forward # every batch. diff --git a/src/main.py b/src/main.py index 1e105f8..e5cc093 100644 --- a/src/main.py +++ b/src/main.py @@ -1,19 +1,21 @@ +import json import logging -import torch -import numpy as np import random -import json -import datasets as nlp -from pytorch_lightning import Trainer -from extractive import ExtractiveSummarizer -from abstractive import AbstractiveSummarizer -from helpers import StepCheckpointCallback from argparse import ArgumentParser -from pytorch_lightning.loggers import WandbLogger -from pytorch_lightning.callbacks.model_checkpoint import ModelCheckpoint + +import numpy as np +import torch +from pytorch_lightning import Trainer from pytorch_lightning.callbacks import LearningRateMonitor +from pytorch_lightning.callbacks.model_checkpoint import ModelCheckpoint +from pytorch_lightning.loggers import WandbLogger from pytorch_lightning.plugins import DeepSpeedPlugin +import datasets as nlp +from abstractive import AbstractiveSummarizer +from extractive import ExtractiveSummarizer +from helpers import StepCheckpointCallback + logger = logging.getLogger(__name__) @@ -24,7 +26,10 @@ def set_seed(seed): torch.backends.cudnn.deterministic = True torch.backends.cudnn.benchmark = False logger.warning( - "Deterministic mode can have a performance impact, depending on your model. This means that due to the deterministic nature of the model, the processing speed (i.e. processed batch items per second) can be lower than when the model is non-deterministic." + "Deterministic mode can have a performance impact, depending on your model. This means " + + "that due to the deterministic nature of the model, the processing speed (i.e. " + + "processed batch items per second) can be lower than when the model is " + + "non-deterministic." ) @@ -55,12 +60,10 @@ def main(args): or "word_embedding_model.embeddings.position_ids" in e_str ): print( - ( - "The below is a common issue. Due to the `transformers` update " - "from 3.0.2 to 3.1.0, models trained in versions <3.0.2 need to be " - "loaded with the `--no_strict` argument. More details can be found at " - "huggingface/transformers#6882." - ) + "The below is a common issue. Due to the `transformers` update " + "from 3.0.2 to 3.1.0, models trained in versions <3.0.2 need to be " + "loaded with the `--no_strict` argument. More details can be found at " + "huggingface/transformers#6882." 
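Relating to the `linear` scheduler wiring above, a sketch of what a picklable warmup-then-linear-decay lambda typically looks like; this mirrors the common `transformers`-style linear schedule and is an assumption, not a copy of the repo's `lr_lambda_func` body.

from functools import partial

def lr_lambda_func(current_step, num_warmup_steps, num_training_steps):
    # multiplier ramps from 0 to 1 during warmup, then decays linearly back to 0
    if current_step < num_warmup_steps:
        return current_step / max(1, num_warmup_steps)
    return max(
        0.0,
        (num_training_steps - current_step)
        / max(1, num_training_steps - num_warmup_steps),
    )

# the partial keeps the lambda picklable and would typically be handed to LambdaLR
lr_lambda = partial(lr_lambda_func, num_warmup_steps=100, num_training_steps=1000)
print(lr_lambda(50), lr_lambda(550))  # 0.5 0.5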
) raise e @@ -87,7 +90,9 @@ def main(args): if args.use_custom_checkpoint_callback: args.checkpoint_callback = ModelCheckpoint( - save_top_k=-1, period=1, verbose=True, + save_top_k=-1, + period=1, + verbose=True, ) if args.custom_checkpoint_every_n: custom_checkpoint_callback = StepCheckpointCallback( @@ -111,7 +116,8 @@ def main(args): new_lr = lr_finder.suggestion() logger.info("Recommended Learning Rate: %s", new_lr) - # remove `args.callbacks` if it exists so it does not get saved with the model (would result in crash) + # remove `args.callbacks` if it exists so it does not get saved with the model + # (would result in crash) if args.custom_checkpoint_every_n: del args.callbacks @@ -133,17 +139,19 @@ def main(args): help="Extractive or abstractive summarization training. Default is 'extractive'.", ) parser.add_argument( - "--default_root_dir", type=str, help="Default path for logs and weights.", + "--default_root_dir", + type=str, + help="Default path for logs and weights.", ) parser.add_argument( "--weights_save_path", type=str, - help="""Where to save weights if specified. Will override `--default_root_dir` for - checkpoints only. Use this if for whatever reason you need the checkpoints stored in + help="""Where to save weights if specified. Will override `--default_root_dir` for + checkpoints only. Use this if for whatever reason you need the checkpoints stored in a different place than the logs written in `--default_root_dir`. If you are using the `wandb` logger, then you must also set `--no_wandb_logger_log_model` - when using this option. Model weights are saved with the wandb logs to be uploaded to - wandb.ai by default. Setting this option without setting `--no_wandb_logger_log_model` + when using this option. Model weights are saved with the wandb logs to be uploaded to + wandb.ai by default. Setting this option without setting `--no_wandb_logger_log_model` effectively creates two save paths, which may crash the script.""", ) parser.add_argument( @@ -202,7 +210,8 @@ def main(args): "--overfit_batches", default=0.0, type=float, - help="Uses this much data of all datasets (training, validation, test). Useful for quickly debugging or trying to overfit on purpose.", + help="Uses this much data of all datasets (training, validation, test). Useful " + + "for quickly debugging or trying to overfit on purpose.", ) parser.add_argument( "--fast_dev_run", @@ -213,13 +222,15 @@ def main(args): "--limit_train_batches", default=1.0, type=float, - help="How much of training dataset to check. Useful when debugging or testing something that happens at the end of an epoch.", + help="How much of training dataset to check. Useful when debugging or testing " + + "something that happens at the end of an epoch.", ) parser.add_argument( "--limit_val_batches", default=1.0, type=float, - help="How much of validation dataset to check. Useful when debugging or testing something that happens at the end of an epoch.", + help="How much of validation dataset to check. 
Useful when debugging or testing something " + + "that happens at the end of an epoch.", ) parser.add_argument( "--limit_test_batches", @@ -231,7 +242,8 @@ def main(args): "--amp_level", type=str, default="O1", - help="The optimization level to use (O1, O2, etc…) for 16-bit GPU precision (using NVIDIA apex under the hood).", + help="The optimization level to use (O1, O2, etc…) for 16-bit GPU precision (using " + + "NVIDIA apex under the hood).", ) parser.add_argument( "--precision", @@ -256,25 +268,32 @@ def main(args): "--progress_bar_refresh_rate", default=50, type=int, - help="How often to refresh progress bar (in steps). In notebooks, faster refresh rates (lower number) is known to crash them because of their screen refresh rates, so raise it to 50 or more.", + help="How often to refresh progress bar (in steps). In notebooks, faster refresh rates " + + "(lower number) is known to crash them because of their screen refresh rates, so raise " + + "it to 50 or more.", ) parser.add_argument( "--num_sanity_val_steps", default=2, type=int, - help="Sanity check runs n batches of val before starting the training routine. This catches any bugs in your validation without having to wait for the first validation check.", + help="Sanity check runs n batches of val before starting the training routine. This " + + "catches any bugs in your validation without having to wait for the first " + + "validation check.", ) parser.add_argument( "--val_check_interval", default=1.0, - help="How often within one training epoch to check the validation set. Can specify as float or int. Use float to check within a training epoch. Use int to check every n steps (batches).", + help="How often within one training epoch to check the validation set. Can specify " + + "as float or int. Use float to check within a training epoch. Use int to check every " + + "n steps (batches).", ) parser.add_argument( "--use_logger", default="wandb", type=str, choices=["tensorboard", "wandb"], - help="Which program to use for logging. If `wandb` is chosen then model weights will automatically be uploaded to wandb.ai.", + help="Which program to use for logging. If `wandb` is chosen then model weights " + + "will automatically be uploaded to wandb.ai.", ) parser.add_argument( "--wandb_project", @@ -285,7 +304,9 @@ def main(args): parser.add_argument( "--gradient_checkpointing", action="store_true", - help="Enable gradient checkpointing (save memory at the expense of a slower backward pass) for the word embedding model. More info: https://github.com/huggingface/transformers/pull/4659#issue-424841871", + help="Enable gradient checkpointing (save memory at the expense of a slower backward " + + "pass) for the word embedding model. " + + "More info: https://github.com/huggingface/transformers/pull/4659#issue-424841871", ) parser.add_argument( "--accelerator", @@ -304,8 +325,8 @@ def main(args): "--load_weights", default=False, type=str, - help="""Loads the model weights from a given checkpoint. Hyperparameters are initialized - from command line arguments. This can be used to change paramters between the training + help="""Loads the model weights from a given checkpoint. Hyperparameters are initialized + from command line arguments. This can be used to change paramters between the training and testing stages, for example.""", ) parser.add_argument( @@ -318,56 +339,63 @@ def main(args): "--resume_from_checkpoint", default=None, type=str, - help="To resume training from a specific checkpoint pass in the path here. 
Automatically restores model, epoch, step, LR schedulers, apex, etc...", + help="To resume training from a specific checkpoint, pass in the path here. Automatically " + + "restores model, epoch, step, LR schedulers, apex, etc...", ) parser.add_argument( "--use_custom_checkpoint_callback", action="store_true", - help="""Use the custom checkpointing callback specified in `main()` by - `args.checkpoint_callback`. By default this custom callback saves the model every - epoch and never deletes the saved weights files. You can change the save path by + help="""Use the custom checkpointing callback specified in `main()` by + `args.checkpoint_callback`. By default this custom callback saves the model every + epoch and never deletes the saved weights files. You can change the save path by setting the `--weights_save_path` option.""", ) parser.add_argument( "--custom_checkpoint_every_n", type=int, default=None, - help="""The number of steps between additional checkpoints. By default checkpoints are saved - every epoch. Setting this value will save them every epoch and every N steps. This does not - use the same callback as `--use_custom_checkpoint_callback` but instead uses a different class - called `StepCheckpointCallback`. When using this callback, you must specify the save path with the - `--weights_save_path` option.""", + help="""The number of steps between additional checkpoints. By default checkpoints are + saved every epoch. Setting this value will save them every epoch and every N steps. This + does not use the same callback as `--use_custom_checkpoint_callback` but instead uses a + different class called `StepCheckpointCallback`. When using this callback, you must specify + the save path with the `--weights_save_path` option.""", ) parser.add_argument( "--no_wandb_logger_log_model", action="store_true", - help="Only applies when using the `wandb` logger. Set this argument to NOT save checkpoints in wandb directory to upload to W&B servers.", + help="Only applies when using the `wandb` logger. Set this argument to NOT save " + + "checkpoints in the wandb directory to upload to W&B servers.", ) parser.add_argument( "--auto_scale_batch_size", default=None, type=str, - help="""Auto scaling of batch size may be enabled to find the largest batch size that fits - into memory. Larger batch size often yields better estimates of gradients, but may also - result in longer training time. Currently, this feature supports two modes 'power' scaling - and 'binsearch' scaling. In 'power' scaling, starting from a batch size of 1 keeps doubling - the batch size until an out-of-memory (OOM) error is encountered. Setting the argument to + help="""Auto scaling of batch size may be enabled to find the largest batch size that fits + into memory. A larger batch size often yields better estimates of gradients, but may also + result in longer training time. Currently, this feature supports two modes, 'power' scaling + and 'binsearch' scaling. In 'power' scaling, the batch size starts at 1 and keeps doubling + until an out-of-memory (OOM) error is encountered. Setting the argument to 'binsearch' continues to finetune the batch size by performing a binary search.
'binsearch' is the recommended option.""", ) parser.add_argument( "--lr_find", action="store_true", - help="Runs a learning rate finder algorithm (see https://arxiv.org/abs/1506.01186) before any training, to find optimal initial learning rate.", + help="Runs a learning rate finder algorithm (see https://arxiv.org/abs/1506.01186) " + + "before any training, to find an optimal initial learning rate.", ) parser.add_argument( - "--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.", + "--adam_epsilon", + default=1e-8, + type=float, + help="Epsilon for Adam optimizer.", ) parser.add_argument( "--optimizer_type", type=str, default="adam", - help="""Which optimizer to use: `adamw` (default), `ranger`, `qhadam`, `radam`, or `adabound`.""", + help="""Which optimizer to use: `adamw` (default), `ranger`, `qhadam`, `radam`, or + `adabound`.""", ) parser.add_argument( "--ranger-k", @@ -375,7 +403,8 @@ def main(args): type=int, help="""Ranger (LookAhead) optimizer k value (default: 6). LookAhead keeps a single extra copy of the weights, then lets the internalized 'faster' optimizer (for Ranger, - that's RAdam) explore for 5 or 6 batches. The batch interval is specified via the k parameter.""", + that's RAdam) explore for 5 or 6 batches. The batch interval is specified via the + k parameter.""", ) parser.add_argument( "--warmup_steps", @@ -386,7 +415,8 @@ def main(args): parser.add_argument( "--no_strict", action="store_false", - help="Load a model with `strict` mode disabled. This will *not* enforce that the keys in `state_dict` match the keys returned by the module's `state_dict()` function.", + help="Load a model with `strict` mode disabled. This will *not* enforce that the keys " + + "in `state_dict` match the keys returned by the module's `state_dict()` function.", ) parser.add_argument( "--use_scheduler", @@ -395,7 +425,7 @@ def main(args): 1. `linear`: Use a linear schedule that inceases linearly over `--warmup_steps` to `--learning_rate` then decreases linearly for the rest of the training process. 2. `onecycle`: Use the one cycle policy with a maximum learning rate of `--learning_rate`. (default: False, don't use any scheduler) - 3. `poly`: polynomial learning rate decay from `--learning_rate` to `--end_learning_rate`""", + 3. `poly`: polynomial learning rate decay from `--learning_rate` to `--end_learning_rate`""", # noqa: E501 ) parser.add_argument( "--end_learning_rate", @@ -408,7 +438,8 @@ def main(args): "--plugins", default=None, type=str, - help="Allows you to connect arbitrary backends. Run `pip install deepspeed mpi4py` to use deepspeed plugin.", + help="Allows you to connect arbitrary backends. Run `pip install deepspeed mpi4py` to use " + + "the deepspeed plugin.", ) parser.add_argument( "-l", @@ -437,7 +468,8 @@ def main(args): and (":" not in main_args[0].plugins) ): logger.error( - "If you are using the 'deepspeed' plugin, you must specify the path the to deepspeed config like so: `--plugins deepspeed:/path/to/config.json`." + "If you are using the 'deepspeed' plugin, you must specify the path to the " + + "deepspeed config like so: `--plugins deepspeed:/path/to/config.json`." ) main_args = parser.parse_args() @@ -450,7 +482,8 @@ def main(args): # Set the `nlp` logging verbosity since its default is not INFO. # If the verbosity is not set back to the default for the library, an abundance - # of output will be printed. See https://huggingface.co/docs/datasets/package_reference/logging_methods.html. + # of output will be printed.
+ # See https://huggingface.co/docs/datasets/package_reference/logging_methods.html. nlp.logging.set_verbosity(nlp.logging.WARNING) # Train diff --git a/src/poly_lr_decay.py b/src/poly_lr_decay.py index 06aed4f..4523eca 100644 --- a/src/poly_lr_decay.py +++ b/src/poly_lr_decay.py @@ -5,11 +5,12 @@ class PolynomialLRDecay(_LRScheduler): """Polynomial learning rate decay until step reach to max_decay_step - + Args: optimizer (Optimizer): Wrapped optimizer. max_decay_steps: after this step, we stop decreasing learning rate - end_learning_rate: scheduler stoping learning rate decay, value of learning rate must be this value + end_learning_rate: the scheduler stops decaying the learning rate once it reaches + this value power: The power of the polynomial. """ diff --git a/src/pooling.py b/src/pooling.py index 3288d46..a5d8f13 100644 --- a/src/pooling.py +++ b/src/pooling.py @@ -36,19 +36,19 @@ def forward( Args: word_vectors (torch.Tensor, optional): Vectors representing words created by a ``word_embedding_model``. Defaults to None. - sent_rep_token_ids (torch.Tensor, optional): See :meth:`extractive.ExtractiveSummarizer.forward`. - Defaults to None. - sent_rep_mask (torch.Tensor, optional): See :meth:`extractive.ExtractiveSummarizer.forward`. - Defaults to None. - sent_lengths (torch.Tensor, optional): See :meth:`extractive.ExtractiveSummarizer.forward`. - Defaults to None. - sent_lengths_mask (torch.Tensor, optional): See :meth:`extractive.ExtractiveSummarizer.forward`. - Defaults to None. + sent_rep_token_ids (torch.Tensor, optional): See + :meth:`extractive.ExtractiveSummarizer.forward`. Defaults to None. + sent_rep_mask (torch.Tensor, optional): See + :meth:`extractive.ExtractiveSummarizer.forward`. Defaults to None. + sent_lengths (torch.Tensor, optional): See + :meth:`extractive.ExtractiveSummarizer.forward`. Defaults to None. + sent_lengths_mask (torch.Tensor, optional): See + :meth:`extractive.ExtractiveSummarizer.forward`. Defaults to None. Returns: tuple: (output_vector, output_mask) Contains the sentence scores and mask as - ``torch.Tensor``\ s. The mask is either the ``sent_rep_mask`` or ``sent_lengths_mask`` - depending on the pooling mode used during model initialization. + ``torch.Tensor``\ s. The mask is either the ``sent_rep_mask`` or + ``sent_lengths_mask`` depending on the pooling mode used during model initialization. """ output_vectors = [] output_masks = [] @@ -77,7 +77,8 @@ def forward( ) # if the sequence contains values that are not zero if ((sequence != 0).sum() != 0) - # any tensor with 2 dimensions (one being the hidden size) that has already been created (will be set to zero from padding) + # any tensor with 2 dimensions (one being the hidden size) that has already + # been created (will be set to zero from padding) else word_vectors[0, 0].float() # for each sentence for sequence in sequences