-
Notifications
You must be signed in to change notification settings - Fork 70
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Some renamings to match the Python API
- Loading branch information
1 parent
f091893
commit 9487fdf
Showing
10 changed files
with
415 additions
and
445 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,58 @@ | ||
#pragma once | ||
|
||
#include <string> | ||
#include <vector> | ||
|
||
#include "onmt/opennmttokenizer_export.h" | ||
|
||
namespace onmt | ||
{ | ||
|
||
// Case categories used by the casing helpers declared in this header | ||
// (lowercase_token, restore_token_casing, TokenCaseMarkup, ...). | ||
// NOTE(review): the rule that assigns a category to a token lives in the implementation, | ||
// which is not visible here; the enumerator names are the only description available. | ||
enum class Casing | ||
{ | ||
None, | ||
Lowercase, | ||
Uppercase, | ||
Mixed, | ||
Capitalized, | ||
}; | ||
|
||
// Takes a token and returns a (string, Casing) pair. | ||
// NOTE(review): judging by the name, presumably the lowercased token together with the | ||
// casing it originally had -- confirm against the definition. | ||
// NOTE(review): std::pair is declared in <utility>, which this header does not include | ||
// directly; it currently relies on a transitive include. | ||
std::pair<std::string, Casing> lowercase_token(const std::string& token); | ||
// Takes a token and a Casing value and returns a string. | ||
// NOTE(review): presumably the token with the given casing re-applied -- confirm. | ||
std::string restore_token_casing(const std::string& token, Casing casing); | ||
|
||
// Conversions between a Casing value and a single char. | ||
// NOTE(review): the concrete Casing <-> char mapping, and the handling of a char that | ||
// matches no Casing, are defined in the implementation -- not visible from this header. | ||
char casing_to_char(Casing type); | ||
Casing char_to_casing(char feature); | ||
|
||
// Kinds of case markup handled by read_case_markup / write_case_markup and stored in | ||
// TokenCaseMarkup. | ||
// NOTE(review): by their names, Modifier presumably applies to a single token while | ||
// RegionBegin / RegionEnd delimit a span of tokens -- confirm against the implementation. | ||
enum class CaseMarkupType | ||
{ | ||
None, | ||
Modifier, | ||
RegionBegin, | ||
RegionEnd, | ||
}; | ||
|
||
// Takes a markup string and returns a Casing. | ||
// NOTE(review): behavior for a string that is not a case markup is not visible here. | ||
Casing get_casing_from_markup(const std::string& markup); | ||
// Takes a markup string and returns its CaseMarkupType. | ||
// NOTE(review): presumably CaseMarkupType::None for non-markup strings -- confirm. | ||
CaseMarkupType read_case_markup(const std::string& markup); | ||
// Takes a markup type and a casing and returns a string. | ||
// NOTE(review): presumably the inverse of the two readers above -- confirm. | ||
std::string write_case_markup(CaseMarkupType markup, Casing casing); | ||
|
||
// Value bundle holding two CaseMarkupType values (prefix and suffix) and one Casing. | ||
// It is the element type of the vector returned by get_case_markups. | ||
struct TokenCaseMarkup | ||
{ | ||
// Stores the three arguments in the same-named members. Because this constructor is | ||
// user-declared, the struct has no default constructor: all three values are required. | ||
TokenCaseMarkup(CaseMarkupType prefix_, CaseMarkupType suffix_, Casing casing_) | ||
: prefix(prefix_) | ||
, suffix(suffix_) | ||
, casing(casing_) | ||
{ | ||
} | ||
// NOTE(review): presumably the markup to emit before / after the token -- confirm. | ||
CaseMarkupType prefix; | ||
CaseMarkupType suffix; | ||
// Casing value associated with this entry. | ||
Casing casing; | ||
}; | ||
|
||
// Forward declaration only: this header names Token solely inside the | ||
// const std::vector<Token>& parameter of get_case_markups, so no definition is needed here. | ||
class Token; | ||
|
||
// Computes the case markups for a sequence of tokens and returns them as a vector of | ||
// TokenCaseMarkup. | ||
// In "soft" mode, this function tries to minimize the number of uppercase regions by possibly | ||
// including case invariant characters (numbers, symbols, etc.) in uppercase regions. | ||
// "soft" mode is enabled by default (soft = true). | ||
// NOTE(review): the relation between the size of the result and the size of `tokens` | ||
// is not visible from this declaration -- confirm against the definition. | ||
std::vector<TokenCaseMarkup> get_case_markups(const std::vector<Token>& tokens, | ||
const bool soft = true); | ||
|
||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.