Skip to content

Commit

Permalink
Require ICU (#164)
Browse files Browse the repository at this point in the history
By requiring ICU, we can remove all hand-crafted Unicode
implementations that are sometimes incomplete (e.g. 32a4cdc).
  • Loading branch information
guillaumekln authored Sep 8, 2020
1 parent 92fbc42 commit 453fb2c
Show file tree
Hide file tree
Showing 18 changed files with 165 additions and 1,295 deletions.
9 changes: 4 additions & 5 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,6 @@ compiler:
env:
global:
- SENTENCEPIECE_VERSION="0.1.8"
matrix:
- WITH_ICU="ON"
- WITH_ICU="OFF"
cache:
directories:
- $HOME/sentencepiece-$SENTENCEPIECE_VERSION/
Expand Down Expand Up @@ -35,15 +32,17 @@ install:
- export TOKENIZER_ROOT=$HOME/Tokenizer
- export SENTENCEPIECE_ROOT=$HOME/sentencepiece-$SENTENCEPIECE_VERSION
- mkdir build && cd build
- cmake -DBUILD_TESTS=ON -DCMAKE_INSTALL_PREFIX=$TOKENIZER_ROOT -DCMAKE_PREFIX_PATH=$SENTENCEPIECE_ROOT -DWITH_ICU=$WITH_ICU ..
- cmake -DBUILD_TESTS=ON -DCMAKE_INSTALL_PREFIX=$TOKENIZER_ROOT -DCMAKE_PREFIX_PATH=$SENTENCEPIECE_ROOT ..
- make install
- cd $ROOT_TRAVIS_DIR
script:
- build/test/onmt_tokenizer_test test/data

matrix:
include:
- env:
- name: C++ tests
- name: Python tests
env:
- TWINE_REPOSITORY_URL="https://upload.pypi.org/legacy/"
services:
- docker
Expand Down
19 changes: 6 additions & 13 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@ endif()

project(OpenNMTTokenizer)

option(WITH_ICU "Compile with ICU" OFF)
option(BUILD_TESTS "Compile unit tests" OFF)
option(BUILD_SHARED_LIBS "Build shared libraries" ON)

Expand All @@ -30,13 +29,15 @@ else()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra")
endif()

find_package(ICU REQUIRED)

set(INCLUDE_DIRECTORIES
${CMAKE_CURRENT_SOURCE_DIR}/include
${PROJECT_BINARY_DIR}
${ICU_INCLUDE_DIRS}
)

set(PUBLIC_HEADERS
include/onmt/Alphabet.h
include/onmt/Token.h
include/onmt/BPE.h
include/onmt/BPELearner.h
Expand All @@ -48,7 +49,6 @@ set(PUBLIC_HEADERS
)

set(SOURCES
src/Alphabet.cc
src/BPE.cc
src/BPELearner.cc
src/Casing.cc
Expand All @@ -62,16 +62,9 @@ set(SOURCES
src/unicode/Unicode.cc
)

list(APPEND LINK_LIBRARIES "")

if (WITH_ICU)
find_package(ICU REQUIRED)
add_definitions(-DWITH_ICU)
list(APPEND INCLUDE_DIRECTORIES ${ICU_INCLUDE_DIRS})
list(APPEND LINK_LIBRARIES ${ICU_LIBRARIES})
else()
list(APPEND SOURCES src/unicode/Data.cc)
endif()
list(APPEND LINK_LIBRARIES
${ICU_LIBRARIES}
)

find_library(SP_LIBRARY NAMES sentencepiece)
find_path(SP_INCLUDE_DIR NAMES sentencepiece_processor.h)
Expand Down
3 changes: 1 addition & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -69,8 +69,8 @@ See the `-h` flag to list the available options.

### Dependencies

* [ICU](http://site.icu-project.org/)
* (optional) [SentencePiece](https://github.com/google/sentencepiece)
* (optional) [ICU](http://site.icu-project.org/)

### Compiling

Expand All @@ -87,7 +87,6 @@ make
It will produce the dynamic library `libOpenNMTTokenizer` and tokenization clients in `cli/`.

* To compile only the library, use the `-DLIB_ONLY=ON` flag.
* To compile with the ICU unicode backend, use the `-DWITH_ICU=ON` flag.

### Testing

Expand Down
2 changes: 1 addition & 1 deletion bindings/python/tools/build_wheel.sh
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ cd $ROOT_DIR
# Build Tokenizer.
mkdir build
cd build
cmake -DLIB_ONLY=ON -DWITH_ICU=ON ..
cmake -DLIB_ONLY=ON ..
make -j2 install
cd $ROOT_DIR

Expand Down
2 changes: 1 addition & 1 deletion docs/options.md
Original file line number Diff line number Diff line change
Expand Up @@ -307,7 +307,7 @@ $ echo "1234" | cli/tokenize --mode aggressive --segment_numbers

### `segment_alphabet` (list of strings, default: `[]`)

List of alphabets for which to split all letters. A complete list of supported alphabets is available in the source file [`Alphabet.h`](../include/onmt/Alphabet.h).
List of alphabets for which to split all letters (can be any [Unicode script alias](https://en.wikipedia.org/wiki/Script_(Unicode))).

```bash
$ echo "測試 abc" | cli/tokenize --segment_alphabet Han
Expand Down
224 changes: 0 additions & 224 deletions include/onmt/Alphabet.h

This file was deleted.

4 changes: 2 additions & 2 deletions include/onmt/Tokenizer.h
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#pragma once

#include <unordered_map>
#include <set>
#include <unordered_set>

#include "onmt/opennmttokenizer_export.h"
#include "onmt/ITokenizer.h"
Expand Down Expand Up @@ -150,7 +150,7 @@ namespace onmt
const SubwordEncoder* _subword_encoder;
std::string _joiner;

std::set<int> _segment_alphabet;
std::unordered_set<int> _segment_alphabet;

void read_flags(int flags);

Expand Down
24 changes: 0 additions & 24 deletions include/onmt/unicode/Data.h

This file was deleted.

Loading

0 comments on commit 453fb2c

Please sign in to comment.