Skip to content

Commit

Permalink
Some renamings to match the Python API
Browse files Browse the repository at this point in the history
  • Loading branch information
guillaumekln committed Aug 3, 2020
1 parent f091893 commit 9487fdf
Show file tree
Hide file tree
Showing 10 changed files with 415 additions and 445 deletions.
4 changes: 2 additions & 2 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ set(PUBLIC_HEADERS
include/onmt/Token.h
include/onmt/BPE.h
include/onmt/BPELearner.h
include/onmt/CaseModifier.h
include/onmt/Casing.h
include/onmt/ITokenizer.h
include/onmt/SpaceTokenizer.h
include/onmt/SubwordEncoder.h
Expand All @@ -47,7 +47,7 @@ set(SOURCES
src/Alphabet.cc
src/BPE.cc
src/BPELearner.cc
src/CaseModifier.cc
src/Casing.cc
src/ITokenizer.cc
src/SpaceTokenizer.cc
src/SubwordEncoder.cc
Expand Down
14 changes: 7 additions & 7 deletions bindings/python/Python.cc
Original file line number Diff line number Diff line change
Expand Up @@ -440,12 +440,12 @@ PYBIND11_MODULE(pyonmttok, m)
{
m.def("is_placeholder", &onmt::Tokenizer::is_placeholder, py::arg("token"));

py::enum_<onmt::CaseModifier::Type>(m, "Casing")
.value("LOWERCASE", onmt::CaseModifier::Type::Lowercase)
.value("UPPERCASE", onmt::CaseModifier::Type::Uppercase)
.value("MIXED", onmt::CaseModifier::Type::Mixed)
.value("CAPITALIZED", onmt::CaseModifier::Type::Capitalized)
.value("NONE", onmt::CaseModifier::Type::None)
py::enum_<onmt::Casing>(m, "Casing")
.value("NONE", onmt::Casing::None)
.value("LOWERCASE", onmt::Casing::Lowercase)
.value("UPPERCASE", onmt::Casing::Uppercase)
.value("MIXED", onmt::Casing::Mixed)
.value("CAPITALIZED", onmt::Casing::Capitalized)
.export_values();

py::enum_<onmt::TokenType>(m, "TokenType")
Expand All @@ -464,7 +464,7 @@ PYBIND11_MODULE(pyonmttok, m)
.def_readwrite("spacer", &onmt::Token::spacer)
.def_readwrite("preserve", &onmt::Token::preserve)
.def_readwrite("features", &onmt::Token::features)
.def_readwrite("casing", &onmt::Token::case_type)
.def_readwrite("casing", &onmt::Token::casing)
.def("__eq__", &onmt::Token::operator==)
;

Expand Down
66 changes: 0 additions & 66 deletions include/onmt/CaseModifier.h

This file was deleted.

58 changes: 58 additions & 0 deletions include/onmt/Casing.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
#pragma once

#include <string>
#include <utility>
#include <vector>

#include "onmt/opennmttokenizer_export.h"

namespace onmt
{

  // Casing category of a token. Token::casing holds one of these values and
  // lowercase_token() returns one alongside its string result.
  enum class Casing
  {
    None,
    Lowercase,
    Uppercase,
    Mixed,
    Capitalized,
  };

  // Returns a (string, Casing) pair computed from `token`.
  // NOTE(review): presumably the lowercased form of `token` together with the
  // casing it originally had -- confirm against the implementation.
  std::pair<std::string, Casing> lowercase_token(const std::string& token);

  // Returns a string built from `token` and `casing`.
  // NOTE(review): presumably the counterpart of lowercase_token() that reapplies
  // `casing` to a lowercased token -- confirm against the implementation.
  std::string restore_token_casing(const std::string& token, Casing casing);

  // Conversions between a Casing value and a single character.
  // NOTE(review): the characters used are chosen by the implementation and are
  // not visible from this header.
  char casing_to_char(Casing casing);
  Casing char_to_casing(char feature);

  // Kind of case markup. TokenCaseMarkup stores one value for the prefix side
  // and one for the suffix side of a token.
  enum class CaseMarkupType
  {
    None,
    Modifier,
    RegionBegin,
    RegionEnd,
  };

  // Helpers converting between the string form of a case markup and its
  // (CaseMarkupType, Casing) description.
  // NOTE(review): the textual markup format is defined by the implementation
  // and is not visible from this header.
  Casing get_casing_from_markup(const std::string& markup);
  CaseMarkupType read_case_markup(const std::string& markup);
  std::string write_case_markup(CaseMarkupType markup, Casing casing);

  // Case markup description for a single token: the markup type on its prefix
  // side, the markup type on its suffix side, and the associated casing.
  struct TokenCaseMarkup
  {
    TokenCaseMarkup(CaseMarkupType prefix_, CaseMarkupType suffix_, Casing casing_)
      : prefix(prefix_)
      , suffix(suffix_)
      , casing(casing_)
    {
    }
    CaseMarkupType prefix;
    CaseMarkupType suffix;
    Casing casing;
  };

  class Token;

  // In "soft" mode, this function tries to minimize the number of uppercase regions by possibly
  // including case invariant characters (numbers, symbols, etc.) in uppercase regions.
  // `soft` defaults to true.
  std::vector<TokenCaseMarkup> get_case_markups(const std::vector<Token>& tokens,
                                                bool soft = true);

}
11 changes: 3 additions & 8 deletions include/onmt/Token.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

#include "onmt/opennmttokenizer_export.h"
#include "onmt/unicode/Unicode.h"
#include "onmt/CaseModifier.h"
#include "onmt/Casing.h"

namespace onmt
{
Expand All @@ -22,7 +22,7 @@ namespace onmt
public:
std::string surface;
TokenType type = TokenType::Word;
CaseModifier::Type case_type = CaseModifier::Type::None;
Casing casing = Casing::None;
bool join_left = false;
bool join_right = false;
bool spacer = false;
Expand All @@ -48,11 +48,6 @@ namespace onmt
return unicode::utf8len(surface);
}

bool has_case() const
{
return case_type != CaseModifier::Type::None;
}

void append_feature(std::string feature)
{
features.emplace_back(std::move(feature));
Expand All @@ -67,7 +62,7 @@ namespace onmt
{
return (surface == other.surface
&& type == other.type
&& case_type == other.case_type
&& casing == other.casing
&& join_left == other.join_left
&& join_right == other.join_right
&& spacer == other.spacer
Expand Down
4 changes: 2 additions & 2 deletions src/BPE.cc
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
#include <limits>

#include "onmt/unicode/Unicode.h"
#include "onmt/CaseModifier.h"
#include "onmt/Casing.h"

namespace onmt
{
Expand Down Expand Up @@ -106,7 +106,7 @@ namespace onmt
std::vector<std::string> chars;

if (_case_insensitive)
unicode::explode_utf8_with_marks(CaseModifier::extract_case(str).first, chars);
unicode::explode_utf8_with_marks(lowercase_token(str).first, chars);
else
unicode::explode_utf8_with_marks(str, chars);

Expand Down
Loading

0 comments on commit 9487fdf

Please sign in to comment.