major update (so many changes...)
theeluwin committed Feb 12, 2021
1 parent a6fd911 commit 157e073
Showing 18 changed files with 619 additions and 351 deletions.
7 changes: 7 additions & 0 deletions .coveragerc
@@ -0,0 +1,7 @@
[run]
source=./tests/
omit=./venv/

[report]
precision=2
show_missing=1
12 changes: 12 additions & 0 deletions .dockerignore
@@ -0,0 +1,12 @@
__pycache__/
build/
dist/
venv/
.coverage
.gitignore
.dockerignore
.travis.yml
Dockerfile
LICENSE
README.md
*.egg-info
7 changes: 7 additions & 0 deletions .flake8
@@ -0,0 +1,7 @@
[flake8]
ignore =
    F401
    E501
exclude =
    build
    venv
5 changes: 3 additions & 2 deletions .gitignore
@@ -59,7 +59,8 @@ docs/_build/
 # PyBuilder
 target/
 
-#Ipython Notebook
+# Ipython Notebook
 .ipynb_checkpoints
 
-dev_*
+# Mac
+.DS_Store
7 changes: 7 additions & 0 deletions .noserc
@@ -0,0 +1,7 @@
[nosetests]
verbosity=1
detailed-errors=1
with-coverage=1
cover-erase=1
cover-package=lexrankr
exclude-dir=./venv/
10 changes: 10 additions & 0 deletions .travis.yml
@@ -0,0 +1,10 @@
language: python

services:
- docker

before_install:
- docker build -t lexrankr -f Dockerfile .

script:
- docker run -e COVERALLS_REPO_TOKEN="$COVERALLS_REPO_TOKEN" lexrankr bash -c 'nosetests --config=.noserc && coveralls'
20 changes: 20 additions & 0 deletions Dockerfile
@@ -0,0 +1,20 @@
# ubuntu
FROM theeluwin/ubuntu-konlpy:latest
LABEL maintainer="Jamie Seol <theeluwin@gmail.com>"

# init
RUN mkdir -p /workspace
WORKDIR /workspace

# install packages
RUN pip install -U pip
RUN pip install setuptools networkx nose nose-exclude flake8 coverage coveralls requests

# install this package
ADD . /workspace/
RUN python setup.py build && \
python setup.py install

# run test
ENTRYPOINT []
CMD ["nosetests", "--config=.noserc"]
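
The image doubles as the local test environment. Per the new .travis.yml and README, it can be built and exercised with the same commands CI uses (assuming a working Docker installation):

```bash
# build the image from the repository root
docker build -t lexrankr -f Dockerfile .

# run the default CMD: nosetests --config=.noserc
docker run --rm -it lexrankr
```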
2 changes: 1 addition & 1 deletion LICENSE
@@ -1,6 +1,6 @@
 The MIT License (MIT)
 
-Copyright (c) 2016 Jamie J Seol
+Copyright (c) 2021 Jamie J Seol
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
79 changes: 55 additions & 24 deletions README.md
@@ -1,45 +1,76 @@
-LexRank for Korean
-==========
+# lexrankr
 
-Text summarization using [LexRank][1] algorithm for Korean.
-Click [here][2] to see how to install [KoNLPy][3] properly.
-[older version][4] using [TextRank][5].
+[![Build Status](https://travis-ci.org/theeluwin/lexrankr.svg?branch=main)](https://travis-ci.org/theeluwin/lexrankr)
+[![Coverage Status](https://coveralls.io/repos/github/theeluwin/lexrankr/badge.svg?branch=main)](https://coveralls.io/github/theeluwin/lexrankr?branch=main)
+[![PyPI version](https://badge.fury.io/py/lexrankr.svg)](https://badge.fury.io/py/lexrankr)
 
-See related paper: [lexrankr: LexRank 기반 한국어 다중 문서 요약][6]
+Clustering-based multi-document selective text summarization using the [LexRank](http://dl.acm.org/citation.cfm?id=1622501) algorithm.
+
+This repository is the source code for the paper [설진석, 이상구. "lexrankr: LexRank 기반 한국어 다중 문서 요약." 한국정보과학회 학술발표논문집 (2016): 458-460](http://www.eiric.or.kr/community/post2.php?m=view&gubun=201612&num=6769).
 
-Installation
------
+* Mostly designed for Korean, but not limited to it.
+* Click [here](http://konlpy.org/en/latest/install/) to see how to install [KoNLPy](http://konlpy.org/) properly.
+* Check out [textrankr](https://github.com/theeluwin/textrankr), a simpler summarizer using [TextRank](http://digital.library.unt.edu/ark:/67531/metadc30962/).
 
-```sh
+## Installation
+
+```bash
 pip install lexrankr
 ```
 
-Usage
------
+## Tokenizers
+
+Tokenizers are not included. You have to implement one yourself.
+
+Example:
+
+```python
+from typing import List
+
+class MyTokenizer:
+    def __call__(self, text: str) -> List[str]:
+        tokens: List[str] = text.split()
+        return tokens
+```
+
+For Korean, one option is to use [KoNLPy](http://konlpy.org):
 
 ```python
-from __future__ import print_function
+from typing import List
+from konlpy.tag import Okt
+
+class OktTokenizer:
+    okt: Okt = Okt()
+
+    def __call__(self, text: str) -> List[str]:
+        tokens: List[str] = self.okt.pos(text, norm=True, stem=True, join=True)
+        return tokens
+```
+
+## Usage
+
+```python
+from typing import List
 from lexrankr import LexRank
 
-lexrank = LexRank()  # can init with various settings
+# 1. init
+mytokenizer: MyTokenizer = MyTokenizer()
+lexrank: LexRank = LexRank(mytokenizer)
+
+# 2. summarize (like, pre-computation)
 lexrank.summarize(your_text_here)
-summaries = lexrank.probe(num_summaries)  # `num_summaries` can be `None` (using auto-detected topics)
+
+# 3. probe (like, query-time)
+summaries: List[str] = lexrank.probe()
 for summary in summaries:
     print(summary)
 ```
 
-Test
------
+## Test
+
+Use Docker.
 
 ```bash
-python -m tests.test
+docker build -t lexrankr -f Dockerfile .
+docker run --rm -it lexrankr
 ```
-
-[1]: http://dl.acm.org/citation.cfm?id=1622501
-[2]: http://konlpy.org/en/latest/install/
-[3]: http://konlpy.org/
-[4]: https://github.com/theeluwin/textrankr
-[5]: http://digital.library.unt.edu/ark:/67531/metadc30962/
-[6]: http://www.eiric.or.kr/community/post2.php?m=view&gubun=201612&num=6769&pg=51&seGubun=&seGubun1=&SnxGubun=%C6%F7%BD%BA%C5%CD&searchBy=&searchWord=
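
Putting the new README pieces together, here is a minimal end-to-end sketch (assuming KoNLPy is installed; the sample text is a placeholder):

```python
from typing import List

from konlpy.tag import Okt
from lexrankr import LexRank


class OktTokenizer:

    okt: Okt = Okt()

    def __call__(self, text: str) -> List[str]:
        return self.okt.pos(text, norm=True, stem=True, join=True)


text: str = "원문 문서를 여기에 넣습니다. 여러 문장으로 이루어진 텍스트입니다."  # placeholder document

lexrank: LexRank = LexRank(OktTokenizer())
lexrank.summarize(text)                  # 1-2. init and summarize (pre-computation)
summaries: List[str] = lexrank.probe()   # 3. probe (query-time)
for summary in summaries:
    print(summary)
```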
13 changes: 7 additions & 6 deletions lexrankr/__init__.py
@@ -1,7 +1,8 @@
-__title__ = 'lexrankr'
-__version__ = '0.1'
-__author__ = 'Jamie Seol'
-__license__ = 'MIT'
-__copyright__ = 'Copyright 2016 Jamie Seol'
-
 from .lexrankr import *
+
+
+__title__: str = 'lexrankr'
+__version__: str = '1.0'
+__author__: str = 'Jamie Seol'
+__license__: str = 'MIT'
+__copyright__: str = 'Copyright 2021 Jamie Seol'
54 changes: 54 additions & 0 deletions lexrankr/corpus.py
@@ -0,0 +1,54 @@
from typing import (
    List,
    Tuple,
    Iterator,
)

from gensim.corpora import (
    Dictionary,
    TextCorpus,
)

from .sentence import Sentence


__all__: Tuple[str, ...] = (
    'SentenceCorpus',
)


class SentenceCorpus(TextCorpus):
    """
        Args:
            sentences: a list of `sentence.Sentence` instances.
            no_below: ignore tokens that appear in fewer than this many sentences (int).
            no_above: ignore tokens that appear in more than this fraction of all sentences (float).
            max_size: maximum vocabulary size.

        See `gensim.corpora.TextCorpus` for more details.
    """

    def __init__(self,
                 sentences: List[Sentence],
                 no_below: int = 3,
                 no_above: float = 0.8,
                 max_size: int = 20000
                 ):

        # preserve original sentences
        self.sentences: List[Sentence] = sentences

        # init dictionary
        self.dictionary: Dictionary = Dictionary(self.get_texts(), prune_at=max_size)
        self.dictionary.filter_extremes(no_below=no_below, no_above=no_above, keep_n=max_size)
        self.dictionary.compactify()

        # precompute bows
        self.bows: List[List[Tuple[int, int]]] = []
        for tokens in self.get_texts():
            bow: List[Tuple[int, int]] = self.dictionary.doc2bow(tokens)
            self.bows.append(bow)

    def get_texts(self) -> Iterator[List[str]]:
        for sentence in self.sentences:
            yield sentence.tokens
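
A minimal sketch of driving `SentenceCorpus` directly, assuming only that a `Sentence` exposes a `tokens` attribute (the one property `get_texts` reads); the stand-in objects below are hypothetical, not the package's real `Sentence` class:

```python
from types import SimpleNamespace

from lexrankr.corpus import SentenceCorpus

# hypothetical stand-ins for `sentence.Sentence`: only `.tokens` is accessed here
sentences = [
    SimpleNamespace(tokens=['이것', '은', '첫', '문장']),
    SimpleNamespace(tokens=['이것', '은', '둘째', '문장']),
]

# relax the filters so the toy vocabulary survives (defaults: no_below=3, no_above=0.8)
corpus = SentenceCorpus(sentences, no_below=1, no_above=1.0)

print(corpus.dictionary.token2id)  # token -> integer id mapping
print(corpus.bows)                 # one precomputed bag-of-words per sentence
```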