Showing 18 changed files with 619 additions and 351 deletions.
@@ -0,0 +1,7 @@
+[run]
+source=./tests/
+omit=./venv/
+
+[report]
+precision=2
+show_missing=1
@@ -0,0 +1,12 @@
+__pycache__/
+build/
+dist/
+venv/
+.coverage
+.gitignore
+.dockerignore
+.travis.yml
+Dockerfile
+LICENSE
+README.md
+*.egg-info
@@ -0,0 +1,7 @@
+[flake8]
+ignore =
+    F401
+    E501
+exclude =
+    build
+    venv
@@ -59,7 +59,8 @@ docs/_build/
 # PyBuilder
 target/

-#Ipython Notebook
+# Ipython Notebook
 .ipynb_checkpoints

+dev_*
 # Mac
 .DS_Store
@@ -0,0 +1,7 @@
+[nosetests]
+verbosity=1
+detailed-errors=1
+with-coverage=1
+cover-erase=1
+cover-package=lexrankr
+exclude-dir=./venv/
@@ -0,0 +1,10 @@
+language: python
+
+services:
+- docker
+
+before_install:
+- docker build -t lexrankr -f Dockerfile .
+
+script:
+- docker run -e COVERALLS_REPO_TOKEN="$COVERALLS_REPO_TOKEN" lexrankr bash -c 'nosetests --config=.noserc && coveralls'
@@ -0,0 +1,20 @@
+# ubuntu
+FROM theeluwin/ubuntu-konlpy:latest
+LABEL maintainer="Jamie Seol <theeluwin@gmail.com>"
+
+# init
+RUN mkdir -p /workspace
+WORKDIR /workspace
+
+# install packages
+RUN pip install -U pip
+RUN pip install setuptools networkx nose nose-exclude flake8 coverage coveralls requests
+
+# install this package
+ADD . /workspace/
+RUN python setup.py build && \
+    python setup.py install
+
+# run test
+ENTRYPOINT []
+CMD ["nosetests", "--config=.noserc"]
@@ -1,45 +1,76 @@
-LexRank for Korean
-==========
+# lexrankr

-Text summarization using [LexRank][1] algorithm for Korean.
-Click [here][2] to see how to install [KoNLPy][3] properly.
-[older version][4] using [TextRank][5].
+[![Build Status](https://travis-ci.org/theeluwin/lexrankr.svg?branch=main)](https://travis-ci.org/theeluwin/lexrankr)
+[![Coverage Status](https://coveralls.io/repos/github/theeluwin/lexrankr/badge.svg?branch=main)](https://coveralls.io/github/theeluwin/lexrankr?branch=main)
+[![PyPI version](https://badge.fury.io/py/lexrankr.svg)](https://badge.fury.io/py/lexrankr)

-See related paper: [lexrankr: LexRank 기반 한국어 다중 문서 요약][6]
+Clustering based multi-document selective text summarization using the [LexRank](http://dl.acm.org/citation.cfm?id=1622501) algorithm.
+
+This repository is the source code for the paper [설진석, 이상구. "lexrankr: LexRank 기반 한국어 다중 문서 요약." 한국정보과학회 학술발표논문집 (2016): 458-460](http://www.eiric.or.kr/community/post2.php?m=view&gubun=201612&num=6769).
+
+* Mostly designed for Korean, but not limited to it.
+* Click [here](http://konlpy.org/en/latest/install/) to see how to install [KoNLPy](http://konlpy.org/) properly.
+* Check out [textrankr](https://github.com/theeluwin/textrankr), which is a simpler summarizer using [TextRank](http://digital.library.unt.edu/ark:/67531/metadc30962/).

-Installation
------
+## Installation

-```sh
+```bash
 pip install lexrankr
 ```

-Usage
------
+## Tokenizers
+
+Tokenizers are not included. You have to implement one by yourself.
+
+Example:
+
+```python
+from typing import List
+
+class MyTokenizer:
+    def __call__(self, text: str) -> List[str]:
+        tokens: List[str] = text.split()
+        return tokens
+```
+
+For Korean, one option is to use [KoNLPy](http://konlpy.org):
+
+```python
+from typing import List
+from konlpy.tag import Okt
+
+class OktTokenizer:
+    okt: Okt = Okt()
+
+    def __call__(self, text: str) -> List[str]:
+        tokens: List[str] = self.okt.pos(text, norm=True, stem=True, join=True)
+        return tokens
+```
+
+## Usage

 ```python
-from __future__ import print_function
+from typing import List
 from lexrankr import LexRank

-lexrank = LexRank()  # can init with various settings
+# 1. init
+mytokenizer: MyTokenizer = MyTokenizer()
+lexrank: LexRank = LexRank(mytokenizer)
+
+# 2. summarize (like, pre-computation)
 lexrank.summarize(your_text_here)
-summaries = lexrank.probe(num_summaries)  # `num_summaries` can be `None` (using auto-detected topics)
+
+# 3. probe (like, query-time)
+summaries: List[str] = lexrank.probe()
 for summary in summaries:
     print(summary)
 ```

-Test
------
+## Test
+
+Use docker.

 ```bash
-python -m tests.test
+docker build -t lexrankr -f Dockerfile .
+docker run --rm -it lexrankr
 ```
-
-[1]: http://dl.acm.org/citation.cfm?id=1622501
-[2]: http://konlpy.org/en/latest/install/
-[3]: http://konlpy.org/
-[4]: https://github.com/theeluwin/textrankr
-[5]: http://digital.library.unt.edu/ark:/67531/metadc30962/
-[6]: http://www.eiric.or.kr/community/post2.php?m=view&gubun=201612&num=6769&pg=51&seGubun=&seGubun1=&SnxGubun=%C6%F7%BD%BA%C5%CD&searchBy=&searchWord=
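For reference, the tokenizer and usage snippets from the updated README combine into one self-contained sketch. The sample text below is illustrative only; real inputs should be substantially longer for the clustering to find meaningful topics.

```python
from typing import List

from lexrankr import LexRank


class MyTokenizer:
    """Whitespace tokenizer from the README, so the sketch runs without KoNLPy."""

    def __call__(self, text: str) -> List[str]:
        return text.split()


text = (
    "The quick brown fox jumps over the lazy dog. "
    "The dog sleeps all day. "
    "The fox runs away quickly."
)

# init with a tokenizer, summarize once (pre-computation), then probe (query-time)
lexrank = LexRank(MyTokenizer())
lexrank.summarize(text)
for summary in lexrank.probe():
    print(summary)
```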
@@ -1,7 +1,8 @@
-__title__ = 'lexrankr'
-__version__ = '0.1'
-__author__ = 'Jamie Seol'
-__license__ = 'MIT'
-__copyright__ = 'Copyright 2016 Jamie Seol'
-
 from .lexrankr import *
+
+
+__title__: str = 'lexrankr'
+__version__: str = '1.0'
+__author__: str = 'Jamie Seol'
+__license__: str = 'MIT'
+__copyright__: str = 'Copyright 2021 Jamie Seol'
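A quick sanity check of the reordered package module, assuming the package is installed (e.g., via the Dockerfile above):

```python
import lexrankr

# `from .lexrankr import *` re-exports the public API at package level,
# so both the class and the metadata dunders are importable directly
print(lexrankr.LexRank)
print(lexrankr.__version__)  # '1.0' as of this commit
```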
@@ -0,0 +1,54 @@
+from typing import (
+    List,
+    Tuple,
+    Iterator,
+)
+
+from gensim.corpora import (
+    Dictionary,
+    TextCorpus,
+)
+
+from .sentence import Sentence
+
+
+__all__: Tuple[str, ...] = (
+    'SentenceCorpus',
+)
+
+
+class SentenceCorpus(TextCorpus):
+    """
+        Args:
+            sentences: a list of `sentence.Sentence` instances.
+            no_below: ignore tokens that appear in fewer than this number of sentences (int).
+            no_above: ignore tokens that appear in more than this fraction of sentences (float).
+            max_size: maximum vocabulary size.
+
+        See `gensim.corpora.TextCorpus` for more details.
+    """
+
+    def __init__(self,
+                 sentences: List[Sentence],
+                 no_below: int = 3,
+                 no_above: float = 0.8,
+                 max_size: int = 20000
+                 ):
+
+        # preserve original sentences
+        self.sentences: List[Sentence] = sentences
+
+        # init dictionary
+        self.dictionary: Dictionary = Dictionary(self.get_texts(), prune_at=max_size)
+        self.dictionary.filter_extremes(no_below=no_below, no_above=no_above, keep_n=max_size)
+        self.dictionary.compactify()
+
+        # precompute bows
+        self.bows: List[List[Tuple[int, int]]] = []
+        for tokens in self.get_texts():
+            bow: List[Tuple[int, int]] = self.dictionary.doc2bow(tokens)
+            self.bows.append(bow)
+
+    def get_texts(self) -> Iterator[List[str]]:
+        for sentence in self.sentences:
+            yield sentence.tokens
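To see what `SentenceCorpus` precomputes, here is a minimal sketch. `FakeSentence` is a hypothetical stand-in for `lexrankr.sentence.Sentence` (not shown in this commit); only its `.tokens` attribute is used, and the import path assumes the new module lands at `lexrankr/corpus.py`.

```python
from dataclasses import dataclass
from typing import List

from lexrankr.corpus import SentenceCorpus


# Hypothetical stand-in for `lexrankr.sentence.Sentence`:
# SentenceCorpus only touches the `.tokens` attribute here.
@dataclass
class FakeSentence:
    tokens: List[str]


sentences = [
    FakeSentence("the quick brown fox jumps".split()),
    FakeSentence("the lazy dog sleeps all day".split()),
    FakeSentence("the quick dog runs away".split()),
]

# loose thresholds so nothing is filtered out of this tiny corpus
corpus = SentenceCorpus(sentences, no_below=1, no_above=1.0)

print(corpus.dictionary.token2id)  # token -> integer id mapping
print(corpus.bows)                 # one (id, count) bag-of-words per sentence
```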