Skip to content

Commit

Permalink
Support new merges serialization (#131)
Browse files Browse the repository at this point in the history
* Support new merges serialization

Introduced in tokenizers 0.20.0. Tokenizers saved with it will create a
`merges` property where each merge is an array of two items, instead of
a string with a separator.

* nit
  • Loading branch information
pcuenca authored Sep 27, 2024
1 parent 0f23067 commit 4f31334
Showing 1 changed file with 16 additions and 4 deletions.
20 changes: 16 additions & 4 deletions Sources/Tokenizers/BPETokenizer.swift
Original file line number Diff line number Diff line change
Expand Up @@ -47,15 +47,27 @@ class BPETokenizer: PreTrainedTokenizerModel {

public let fuseUnknownTokens: Bool

/// Extracts the BPE merge rules from the `merges` configuration entry.
///
/// Two serialization formats are recognized:
/// - New format (written by tokenizers >= 0.20.0): each merge is already a list of strings,
///   so the value is returned unchanged.
/// - Legacy format: each merge is a single string whose items are separated by a space;
///   it is split on every space, keeping empty pieces.
///
/// - Parameter config: The `merges` configuration node, or `nil` when it is absent.
/// - Returns: One array of strings per merge, or `nil` when `config` is `nil` or its value
///   matches neither format.
static func mergesFromConfig(_ config: Config?) -> [[String]]? {
    guard let mergesConfig = config else { return nil }

    // New format: the value is already a list of string lists.
    if let mergePairs = mergesConfig.value as? [[String]] {
        return mergePairs
    }

    // Legacy format: every merge is one space-separated string.
    guard let legacyMerges = mergesConfig.value as? [String] else { return nil }

    var mergePairs: [[String]] = []
    for legacyMerge in legacyMerges {
        // Split on unicode scalars and keep empty pieces, so consecutive spaces are not collapsed.
        let pieces = legacyMerge.unicodeScalars.split(separator: " ", omittingEmptySubsequences: false)
        mergePairs.append(pieces.map { String($0) })
    }
    return mergePairs
}

required init(tokenizerConfig: Config, tokenizerData: Config, addedTokens: [String : Int]) throws {
guard let merges = tokenizerData.model?.merges?.value as? [String] else { fatalError("BPETokenizer requires merges") }
guard let merges = Self.mergesFromConfig(tokenizerData.model?.merges) else { fatalError("BPETokenizer requires merges") }
guard let vocab = tokenizerData.model?.vocab?.dictionary as? [NSString: Int] else {
throw TokenizerError.missingVocab
}
var bpeRanks: Dictionary<BytePair, Int> = [:]
for (i, item) in merges.enumerated() {
let tuple = item.unicodeScalars.split(separator: " ", omittingEmptySubsequences: false).map { String($0) }
let bp = BytePair(tuple: tuple)
for (i, merge) in merges.enumerated() {
let bp = BytePair(tuple: merge)
bpeRanks[bp] = i
}
self.bpeRanks = bpeRanks
Expand Down

0 comments on commit 4f31334

Please sign in to comment.