Skip to content

Commit

Permalink
Require ICU (#164)
Browse files Browse the repository at this point in the history
By requiring ICU, we can remove all hand-crafted Unicode
implementations that are sometimes incomplete (e.g. 32a4cdc).
  • Loading branch information
guillaumekln authored Sep 8, 2020
1 parent 92fbc42 commit 453fb2c
Show file tree
Hide file tree
Showing 18 changed files with 165 additions and 1,295 deletions.
9 changes: 4 additions & 5 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,6 @@ compiler:
env:
global:
- SENTENCEPIECE_VERSION="0.1.8"
matrix:
- WITH_ICU="ON"
- WITH_ICU="OFF"
cache:
directories:
- $HOME/sentencepiece-$SENTENCEPIECE_VERSION/
Expand Down Expand Up @@ -35,15 +32,17 @@ install:
- export TOKENIZER_ROOT=$HOME/Tokenizer
- export SENTENCEPIECE_ROOT=$HOME/sentencepiece-$SENTENCEPIECE_VERSION
- mkdir build && cd build
- cmake -DBUILD_TESTS=ON -DCMAKE_INSTALL_PREFIX=$TOKENIZER_ROOT -DCMAKE_PREFIX_PATH=$SENTENCEPIECE_ROOT -DWITH_ICU=$WITH_ICU ..
- cmake -DBUILD_TESTS=ON -DCMAKE_INSTALL_PREFIX=$TOKENIZER_ROOT -DCMAKE_PREFIX_PATH=$SENTENCEPIECE_ROOT ..
- make install
- cd $ROOT_TRAVIS_DIR
script:
- build/test/onmt_tokenizer_test test/data

matrix:
include:
- env:
- name: C++ tests
- name: Python tests
env:
- TWINE_REPOSITORY_URL="https://upload.pypi.org/legacy/"
services:
- docker
Expand Down
19 changes: 6 additions & 13 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@ endif()

project(OpenNMTTokenizer)

option(WITH_ICU "Compile with ICU" OFF)
option(BUILD_TESTS "Compile unit tests" OFF)
option(BUILD_SHARED_LIBS "Build shared libraries" ON)

Expand All @@ -30,13 +29,15 @@ else()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra")
endif()

find_package(ICU REQUIRED)

set(INCLUDE_DIRECTORIES
${CMAKE_CURRENT_SOURCE_DIR}/include
${PROJECT_BINARY_DIR}
${ICU_INCLUDE_DIRS}
)

set(PUBLIC_HEADERS
include/onmt/Alphabet.h
include/onmt/Token.h
include/onmt/BPE.h
include/onmt/BPELearner.h
Expand All @@ -48,7 +49,6 @@ set(PUBLIC_HEADERS
)

set(SOURCES
src/Alphabet.cc
src/BPE.cc
src/BPELearner.cc
src/Casing.cc
Expand All @@ -62,16 +62,9 @@ set(SOURCES
src/unicode/Unicode.cc
)

list(APPEND LINK_LIBRARIES "")

if (WITH_ICU)
find_package(ICU REQUIRED)
add_definitions(-DWITH_ICU)
list(APPEND INCLUDE_DIRECTORIES ${ICU_INCLUDE_DIRS})
list(APPEND LINK_LIBRARIES ${ICU_LIBRARIES})
else()
list(APPEND SOURCES src/unicode/Data.cc)
endif()
list(APPEND LINK_LIBRARIES
${ICU_LIBRARIES}
)

find_library(SP_LIBRARY NAMES sentencepiece)
find_path(SP_INCLUDE_DIR NAMES sentencepiece_processor.h)
Expand Down
3 changes: 1 addition & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -69,8 +69,8 @@ See the `-h` flag to list the available options.

### Dependencies

* [ICU](http://site.icu-project.org/)
* (optional) [SentencePiece](https://github.com/google/sentencepiece)
* (optional) [ICU](http://site.icu-project.org/)

### Compiling

Expand All @@ -87,7 +87,6 @@ make
It will produce the dynamic library `libOpenNMTTokenizer` and tokenization clients in `cli/`.

* To compile only the library, use the `-DLIB_ONLY=ON` flag.
* To compile with the ICU unicode backend, use the `-DWITH_ICU=ON` flag.

### Testing

Expand Down
2 changes: 1 addition & 1 deletion bindings/python/tools/build_wheel.sh
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ cd $ROOT_DIR
# Build Tokenizer.
mkdir build
cd build
cmake -DLIB_ONLY=ON -DWITH_ICU=ON ..
cmake -DLIB_ONLY=ON ..
make -j2 install
cd $ROOT_DIR

Expand Down
2 changes: 1 addition & 1 deletion docs/options.md
Original file line number Diff line number Diff line change
Expand Up @@ -307,7 +307,7 @@ $ echo "1234" | cli/tokenize --mode aggressive --segment_numbers

### `segment_alphabet` (list of strings, default: `[]`)

List of alphabets for which to split all letters. A complete list of supported alphabets is available in the source file [`Alphabet.h`](../include/onmt/Alphabet.h).
List of alphabets for which to split all letters (can be any [Unicode script alias](https://en.wikipedia.org/wiki/Script_(Unicode))).

```bash
$ echo "測試 abc" | cli/tokenize --segment_alphabet Han
Expand Down
224 changes: 0 additions & 224 deletions include/onmt/Alphabet.h

This file was deleted.

4 changes: 2 additions & 2 deletions include/onmt/Tokenizer.h
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#pragma once

#include <unordered_map>
#include <set>
#include <unordered_set>

#include "onmt/opennmttokenizer_export.h"
#include "onmt/ITokenizer.h"
Expand Down Expand Up @@ -150,7 +150,7 @@ namespace onmt
const SubwordEncoder* _subword_encoder;
std::string _joiner;

std::set<int> _segment_alphabet;
std::unordered_set<int> _segment_alphabet;

void read_flags(int flags);

Expand Down
24 changes: 0 additions & 24 deletions include/onmt/unicode/Data.h

This file was deleted.

Loading

0 comments on commit 453fb2c

Please sign in to comment.