-
Notifications
You must be signed in to change notification settings - Fork 70
/
Tokenizer.h
210 lines (178 loc) · 7.53 KB
/
Tokenizer.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
#pragma once
#include <memory>
#include <unordered_map>
#include <unordered_set>
#include "onmt/opennmttokenizer_export.h"
#include "onmt/ITokenizer.h"
#include "onmt/Token.h"
namespace onmt
{
// Sets the seed used by the library's random number generation.
// NOTE(review): which components consume this seed is not visible in this
// header (presumably randomized subword sampling) -- confirm in the implementation.
void OPENNMTTOKENIZER_EXPORT set_random_seed(const unsigned int seed);
// Forward declaration: this header only refers to SubwordEncoder through
// pointers and std::shared_ptr, so the full definition is not needed here.
class SubwordEncoder;
// Tokenizer implementing the ITokenizer interface. An instance holds an Options
// value and an optional, shared, immutable SubwordEncoder.
class OPENNMTTOKENIZER_EXPORT Tokenizer: public ITokenizer
{
public:
// Tokenization modes, selected through Options::mode.
enum class Mode
{
Conservative,
Aggressive,
Char,
Space,
None
};
// Conversions between a Mode value and its string representation.
// NOTE(review): how unknown mode names are handled is not visible in this header.
static Mode str_to_mode(const std::string& mode);
static std::string mode_to_str(const Mode mode);
// See https://github.com/OpenNMT/Tokenizer/blob/master/docs/options.md for more details.
// Defaults: mode is Conservative, every boolean switch is false, and the
// string/vector fields are empty.
struct Options
{
Mode mode = Mode::Conservative;
std::string lang;
bool no_substitution = false;
bool case_feature = false;
bool case_markup = false;
bool soft_case_regions = false;
bool with_separators = false;
bool allow_isolated_marks = false;
bool joiner_annotate = false;
bool joiner_new = false;
std::string joiner;
bool spacer_annotate = false;
bool spacer_new = false;
bool preserve_placeholders = false;
bool preserve_segmented_tokens = false;
bool support_prior_joiners = false;
bool segment_case = false;
bool segment_numbers = false;
bool segment_alphabet_change = false;
std::vector<std::string> segment_alphabet;
Options() = default;
// Builds options from a mode, an integer flag mask and a joiner string.
// NOTE(review): legacy_flags is presumably a combination of the deprecated
// Tokenizer::Flags values -- confirm in the implementation.
Options(Mode mode, int legacy_flags, const std::string& joiner = joiner_marker);
// Non-const: may modify the options in addition to checking them.
// NOTE(review): exact checks and error reporting are defined in the implementation.
void validate();
private:
// Internal state reachable only by Options itself and by Tokenizer (friend).
bool add_alphabet_to_segment(const std::string& alphabet);
// NOTE(review): presumably integer codes derived from segment_alphabet -- confirm.
std::unordered_set<int> segment_alphabet_codes;
friend class Tokenizer;
};
// Marker constants; declared here and defined out of line.
// joiner_marker is the default joiner of the legacy constructors below.
static const std::string joiner_marker;
static const std::string spacer_marker;
static const std::string ph_marker_open;
static const std::string ph_marker_close;
static const std::string escaped_character_prefix;
static const size_t escaped_character_width;
// Primary constructor: takes the options by value and an optional shared
// subword encoder (nullptr means no encoder is attached).
Tokenizer(Options options,
const std::shared_ptr<const SubwordEncoder>& subword_encoder = nullptr);
// Keep the ITokenizer overloads visible; the overloads declared in this class
// would otherwise hide the base-class names.
using ITokenizer::tokenize;
using ITokenizer::detokenize;
// Tokenizes text; results are written to the words and features output
// parameters. Overrides ITokenizer.
// NOTE(review): the effect of the training flag is not visible in this header.
void tokenize(const std::string& text,
std::vector<std::string>& words,
std::vector<std::vector<std::string> >& features,
bool training = true) const override;
// Same as above with an additional alphabets output map (string -> size_t).
// NOTE(review): presumably per-alphabet counts -- confirm in the implementation.
void tokenize(const std::string& text,
std::vector<std::string>& words,
std::vector<std::vector<std::string> >& features,
std::unordered_map<std::string, size_t>& alphabets,
bool training = true) const override;
// Tokenizes text into Token objects instead of plain strings. Not an override.
void tokenize(const std::string& text,
std::vector<Token>& annotated_tokens,
bool training = true) const;
// Conversions between string tokens (with features) and Token objects.
// NOTE(review): the exact annotation rules are defined in the implementation.
Token annotate_token(const std::string& word) const;
void annotate_tokens(const std::vector<std::string>& words,
const std::vector<std::vector<std::string>>& features,
std::vector<Token>& tokens) const;
void finalize_tokens(const std::vector<Token>& annotated_tokens,
std::vector<std::string>& tokens,
std::vector<std::vector<std::string>>& features) const;
// Detokenization from Token objects; the second overload also fills ranges.
// NOTE(review): Ranges is declared outside this header; the meaning of
// merge_ranges is not visible here.
std::string detokenize(const std::vector<Token>& tokens) const;
std::string detokenize(const std::vector<Token>& tokens,
Ranges& ranges, bool merge_ranges = false) const;
// Detokenization from words and features. Both overloads override ITokenizer.
std::string detokenize(const std::vector<std::string>& words,
const std::vector<std::vector<std::string> >& features) const override;
std::string detokenize(const std::vector<std::string>& words,
const std::vector<std::vector<std::string> >& features,
Ranges& ranges, bool merge_ranges = false) const override;
// Accessors for the subword encoder and the options.
void set_subword_encoder(const std::shared_ptr<const SubwordEncoder>& subword_encoder);
// Returns a reference to the stored shared_ptr (may hold nullptr).
const std::shared_ptr<const SubwordEncoder>& get_subword_encoder() const
{
return _subword_encoder;
}
// Returns a read-only reference to the stored options.
const Options& get_options() const
{
return _options;
}
private:
Options _options;
std::shared_ptr<const SubwordEncoder> _subword_encoder;
// Internal tokenization helpers.
// NOTE(review): the division of work between these helpers is not visible here.
void tokenize_on_placeholders(const std::string& text,
std::vector<Token>& annotated_tokens) const;
void tokenize_text(const std::string& text,
std::vector<Token>& annotated_tokens,
std::unordered_map<std::string, size_t>* alphabets) const;
// Pointer-taking variants of the public overloads.
// NOTE(review): presumably the shared implementations, with a null pointer
// meaning the optional output is not requested -- confirm.
void tokenize(const std::string& text,
std::vector<Token>& annotated_tokens,
std::unordered_map<std::string, size_t>* alphabets,
bool training) const;
void tokenize(const std::string& text,
std::vector<std::string>& words,
std::vector<std::vector<std::string> >& features,
std::unordered_map<std::string, size_t>* alphabets,
bool training) const;
std::string detokenize(const std::vector<Token>& tokens,
Ranges* ranges,
bool merge_ranges = false,
const std::vector<size_t>* index_map = nullptr) const;
std::string detokenize(const std::vector<std::string>& words,
const std::vector<std::vector<std::string> >& features,
Ranges* ranges, bool merge_ranges = false) const;
// Builds Token objects from words and features; index_map is an optional output.
// NOTE(review): the contents of index_map are defined in the implementation.
void parse_tokens(const std::vector<std::string>& words,
const std::vector<std::vector<std::string>>& features,
std::vector<Token>& tokens,
std::vector<size_t>* index_map = nullptr) const;
public:
// The symbols below are deprecated but kept for backward compatibility.
// Bit flags, one distinct bit per option, combinable into an int mask.
enum Flags
{
None = 0,
CaseFeature = 1 << 0,
JoinerAnnotate = 1 << 1,
JoinerNew = 1 << 2,
WithSeparators = 1 << 3,
SegmentCase = 1 << 4,
SegmentNumbers = 1 << 5,
SegmentAlphabetChange = 1 << 6,
CacheBPEModel = 1 << 7, // Deprecated.
NoSubstitution = 1 << 8, // Do not replace special characters.
SpacerAnnotate = 1 << 9,
CacheModel = 1 << 10, // Deprecated.
SentencePieceModel = 1 << 11,
PreservePlaceholders = 1 << 12,
SpacerNew = 1 << 13,
PreserveSegmentedTokens = 1 << 14,
CaseMarkup = 1 << 15,
SupportPriorJoiners = 1 << 16,
SoftCaseRegions = 1 << 17,
};
// Legacy flag-based constructor.
// NOTE(review): how model_path, vocab_path and vocab_threshold are used is
// defined in the implementation.
Tokenizer(Mode mode,
int flags = Flags::None,
const std::string& model_path = "",
const std::string& joiner = joiner_marker,
const std::string& vocab_path = "",
int vocab_threshold = 50);
// External subword encoder constructor.
// Note: the tokenizer takes ownership of the subword_encoder pointer.
Tokenizer(Mode mode,
const SubwordEncoder* subword_encoder,
int flags = Flags::None,
const std::string& joiner = joiner_marker);
// SentencePiece-specific constructor.
// NOTE(review): sp_nbest_size and sp_alpha are presumably SentencePiece
// sampling parameters -- confirm.
Tokenizer(const std::string& sp_model_path,
int sp_nbest_size = 0,
float sp_alpha = 0.1,
Mode mode = Mode::None,
int flags = Flags::None,
const std::string& joiner = joiner_marker);
// Legacy mutators. set_joiner returns a reference to a Tokenizer, which
// allows call chaining.
Tokenizer& set_joiner(const std::string& joiner);
void unset_annotate();
bool add_alphabet_to_segment(const std::string& alphabet);
// Static predicate on a string.
// NOTE(review): presumably checks for the ph_marker_open / ph_marker_close
// delimiters -- confirm.
static bool is_placeholder(const std::string& str);
};
}