/** * @license * Copyright 2023 Google LLC. * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ============================================================================= */ /** * Tokenizer layers. */ /* Original source: keras-nlp/tokenizer.py */ import { serialization, tensor, tidy } from '@tensorflow/tfjs-core'; import { Layer } from '../../engine/topology'; import { NotImplementedError, ValueError } from '../../errors'; import { BytePairTokenizerCache, bytesToUnicode, createStaticHashtable, removeStringsFromInputs, splitStringsForBpe } from './tokenizers_utils'; import { tensorToArr, tensorArrTo2DArr } from './utils'; /** * Base class for Tokenizers. * * Tokenizers in the tfjs library should all subclass this layer. * The class provides two core methods `tokenize()` and `detokenize()` for * going from plain text to sequences and back. A tokenizer is a subclass of * `Layer` and can be combined with other layers in a `tf.sequential` model. * * Subclassers should always implement the `tokenize()` method, which will also * be the default when calling the layer directly on inputs. * * Subclassers can optionally implement the `detokenize()` method if the * tokenization is reversible. Otherwise, this can be skipped. * * Subclassers should implement `get_vocabulary()`, `vocabulary_size()`, * `token_to_id()` and `id_to_token()` if applicable. For some simple * "vocab free" tokenizers, such as a whitespace splitter shown below, these * methods do not apply and can be skipped. * * Example: * * ```js * class WhitespaceSplitterTokenizer extends Tokenizer { * tokenize(inputs: Tensor): Tensor[] { * const stringInputs = inputs.dataSync() as unknown as string[]; * return stringInputs.map(input => Tensor(input.split(' '))); * } * * override detokenize(inputs: Tensor[]): Tensor { * const stringInputs = inputs.map( * input => input.dataSync() as unknown as string[]); * return Tensor(stringInputs.map(str => str.join(' '))); * } * } * * const tokenizer = new WhitespaceSplitterTokenizer(); * * tokenizer.tokenize(tensor(['this is a test']))[0].print(); * * tokenizer.detokenize([tensor(['this', 'is', 'a', 'test'])]).print(); * ``` */ export class Tokenizer extends Layer { /** * Transform tokens back into strings. * * @param inputs Input tensor. * @param kwargs Additional keyword arguments. */ detokenize(inputs) { throw new NotImplementedError(`No implementation of 'detokenize()' was found for ${this.constructor.name}.`); } /** * Get the tokenizer vocabulary as a list of strings terms. */ get vocabulary() { throw new NotImplementedError(`No implementation of 'vocabulary()' was found for ${this.constructor.name}.`); } /** * Returns the total size of the token id space. */ get vocabularySize() { throw new NotImplementedError(`No implementation of 'vocabularySize()' was found for ${this.constructor.name}.`); } /** * Convert an integer id to a string token. */ idToToken(id) { throw new NotImplementedError(`No implementation of 'idToToken()' was found for ${this.constructor.name}.`); } /** * Convert an integer id to a string token. */ tokenToId(token) { throw new NotImplementedError(`No implementation of 'tokenToId()' was found for ${this.constructor.name}.`); } call(inputs, { mode = 'tokenize' } = {}) { if (mode === 'tokenize') { if (inputs instanceof Array) { throw new ValueError(`tokenize expects Tensor, not Tensor[].`); } return this.tokenize(inputs); } if (mode === 'detokenize') { if (!(inputs instanceof Array)) { throw new ValueError(`detokenize expects Tensor[], not Tensor.`); } return this.detokenize(inputs); } throw new ValueError(`Input mode=${mode} is not supported.`); } } /** * Byte-pair encoding tokenizer layer. * * This BPE tokenizer provides the same functionality as the official GPT-2 * tokenizer. Given the same `vocabulary` which maps tokens to ids, and `merges` * which describes BPE merge rules, it should provide the same output as OpenAI * implementation (https://github.com/openai/gpt-2/blob/master/src/encoder.py). * * If input is a batch of strings (rank > 0): * By default, the layer will output a `Tensor[]`. * If `sequenceLength` is set, the layer will output a `Tensor[]` where all * inputs have been padded or truncated to `sequenceLength`. * * Examples: * * Tokenize * ```js * const vocabulary = new Map([['butter', 1], ['fly', 2]]); * const merges = ['b u', 't t', 'e r', 'bu tt', 'butt er', 'f l', 'fl y']; * const tokenizer = new BytePairTokenizer({vocabulary, merges}); * * tokenizer.tokenize(tensor(['butterfly']))[0].print(); * tokenizer.tokenize(tensor(['butterfly, butter']))[1].print(); * ``` * * Detokenize * ```js * const vocabulary = new Map([['butter', 1], ['fly', 2]]); * const merges = ['b u', 't t', 'e r', 'bu tt', 'butt er', 'f l', 'fl y']; * const tokenizer = new BytePairTokenizer({vocabulary, merges}); * * tokenizer.detokenize([[1, 2]]).print(); * ``` */ class BytePairTokenizer extends Tokenizer { constructor(args) { super(args); this.cache = new BytePairTokenizerCache(); this._vocabulary = new Map(args.vocabulary); this.merges = [...args.merges]; this.sequenceLength = args.sequenceLength || null; this.addPrefixSpace = args.addPrefixSpace || false; this.unsplittableTokens = args.unsplittableTokens || null; // Create byte <=> unicode mapping. This is useful for handling // whitespace tokens. const [byteList, unicodeList] = bytesToUnicode(); this.byte2Unicode = createStaticHashtable(Array.from(byteList), unicodeList, ''); if (this.unsplittableTokens) { // Put unsplittable tokens into cache, so it won't be further split and // merged. this.cache.insert(this.unsplittableTokens, this.unsplittableTokens); } // Create mapping between string tokens to int ids, and vice versa. const bytePairs = [...this._vocabulary.keys()]; const bytePairEncodingIndicies = [...this._vocabulary.values()]; this.tokenToIdMap = createStaticHashtable(bytePairs, bytePairEncodingIndicies, -1); this.idToTokenMap = createStaticHashtable(bytePairEncodingIndicies, bytePairs, ''); // Create ranking of merge rules, this is the same as order of merge pairs // in `this.merges`. this.mergeRanksLookupDefault = this.merges.length + 1; this.mergeRanks = createStaticHashtable(this.merges, [...Array(this.merges.length).keys()], this.mergeRanksLookupDefault); } /** * Get the tokenizer vocabulary as a list of string tokens. */ get vocabulary() { return [...this._vocabulary.keys()]; } /** * Get the size of the tokenizer vocabulary. */ get vocabularySize() { return this._vocabulary.size; } /** * Convert an integer id to a string token. */ idToToken(id) { // This will be slow, but keep memory usage down compared to building a // dict. Assuming the main use case is looking up a few special tokens // early in the vocab, this should be fine. const keys = this.vocabulary; for (const token of keys) { if (this._vocabulary.get(token) === id) { return token; } } return undefined; } /** * Convert a string token to an integer id. */ tokenToId(token) { return this._vocabulary.get(token); } getConfig() { const config = { vocabulary: Array.from(this._vocabulary.entries()), merges: this.merges, sequenceLength: this.sequenceLength, addPrefixSpace: this.addPrefixSpace, unsplittableTokens: this.unsplittableTokens, }; const baseConfig = super.getConfig(); Object.assign(config, baseConfig); return config; } /** * Perform one step of byte-pair merge. */ bpeMergeOneStep(words, mask) { const wordsStr = tensorArrTo2DArr(words); // Get all word pairs. const first = wordsStr.map(arr => arr.slice(0, -1)); const second = wordsStr.map(arr => arr.slice(1, arr.length)); // Mask empty. const nonEmptyMask = second.map(arr => arr.length > 0); mask = mask.map((a, idx) => a && nonEmptyMask[idx]); if (!mask.some(e => e)) { return [words, mask]; } const nonEmptyIndices = mask .map((bool, idx) => bool ? idx : -1) .filter(e => e !== -1); const filteredFirst = nonEmptyIndices.map(idx => first[idx]); const filteredSecond = nonEmptyIndices.map(idx => second[idx]); // Get byte pair ranking in merge rules. const pairs = filteredFirst.map((firstSubArr, idx) => { const secondSubArr = filteredSecond[idx]; return firstSubArr.map((char, idx) => `${char} ${secondSubArr[idx]}`); }); const pairRanksTensor = this.mergeRanks.lookup(pairs.map(arr => tensor(arr))); const pairRanks = tensorArrTo2DArr(pairRanksTensor); // Get BPE pair ranks. const minPairRank = pairRanks.map(arr => arr.reduce((a, b) => Math.min(a, b), Infinity)); const pairFoundMask = minPairRank.map(rank => rank !== this.mergeRanksLookupDefault); // Tokens that cannot be further merged are marked as finished. for (const [idx, index] of nonEmptyIndices.entries()) { const update = pairFoundMask[idx]; mask[index] = update; } if (!mask.some(e => e)) { return [words, mask]; } function argMin(arr) { return arr.indexOf(arr.reduce((a, b) => Math.min(a, b), Infinity)); } const maskedPairRanks = pairRanks.filter((_, idx) => pairFoundMask[idx]); const minPairRankIndices = maskedPairRanks.map(arr => argMin(arr)); // Get words and pairs to process. const unfinishedWords = wordsStr.filter((_, idx) => mask[idx]); const pairLeft = unfinishedWords.map((word, idx) => word[minPairRankIndices[idx]]); const pairRight = unfinishedWords.map((word, idx) => word[minPairRankIndices[idx] + 1]); const mergedPairs = pairLeft.map((left, idx) => { const right = pairRight[idx]; return `${left}${right}`; }); const unfinishedWordsIndices = mask .map((_, idx) => idx) .filter((_, idx) => mask[idx]); const mergedPairIndices = unfinishedWordsIndices.map((index, idx) => [index, minPairRankIndices[idx]]); const emptyStringIndices = unfinishedWordsIndices.map((index, idx) => [index, minPairRankIndices[idx] + 1]); for (const [idx, indices] of mergedPairIndices.entries()) { const [wordIdx, charIdx] = indices; const mergedPair = mergedPairs[idx]; wordsStr[wordIdx][charIdx] = mergedPair; } for (const indices of emptyStringIndices) { const [wordIdx, charIdx] = indices; wordsStr[wordIdx][charIdx] = ''; } words = wordsStr.map(word => tensor(word)); words = removeStringsFromInputs(words, ''); return [words, mask]; } /** * Perform byte-pair merge for each word in the inputs. */ bpeMerge(words) { const numWords = words.length; // Merge bytes. function loopCondition(mask) { return mask.some(e => e); } const initialMask = Array(numWords).fill(true); let mergedWords = words; let mask = initialMask; while (loopCondition(mask)) { [mergedWords, mask] = this.bpeMergeOneStep(mergedWords, mask); } return mergedWords; } /** * Map token bytes to unicode using `byte2unicode`. */ transformBytes(tokens) { const tokensStr = tensorToArr(tokens); const splitBytes = tokensStr.map(token => tensor(token.split('').map(c => c.charCodeAt(0)))); const splitUnicode = this.byte2Unicode.lookup(splitBytes); return splitUnicode; } /** * Process unseen tokens and add to cache. */ bpeMergeAndUpdateCache(tokens) { const words = this.transformBytes(tokens); const tokenizedWordsTensor = this.bpeMerge(words); const tokenizedWords = tensorArrTo2DArr(tokenizedWordsTensor); // For each word, join all its token by a whitespace, // e.g., ["dragon", "fly"] => "dragon fly" for hash purpose. const joinedTokens = tokenizedWords.map(word => word.join(' ')); this.cache.insert(tokens, joinedTokens); } tokenize(inputs) { return tidy(() => { if (this.addPrefixSpace) { const strInputs = tensorToArr(inputs); inputs = tensor(strInputs.map(word => ' ' + word)); } const rawTokensTensor = splitStringsForBpe(inputs, this.unsplittableTokens); const rawTokens = tensorArrTo2DArr(rawTokensTensor); const tokenRowSplits = [0]; for (const [idx, token] of rawTokens.entries()) { tokenRowSplits.push(tokenRowSplits[idx] + token.length); } const flatTokens = rawTokens.reduce((acc, e) => acc.concat(e), []); // Check cache. const cacheLookup = this.cache.lookup(flatTokens); const cacheMask = cacheLookup.map(e => e === ''); const hasUnseenWords = cacheMask.some((bool, idx) => bool && flatTokens[idx] !== ''); const processUnseenTokens = () => { const unseenTokens = flatTokens.filter((_, idx) => cacheMask[idx]); this.bpeMergeAndUpdateCache(tensor(unseenTokens)); return this.cache.lookup(flatTokens); }; // If `has_unseen_words == True`, it means not all tokens are in cache, // we will process the unseen tokens. Otherwise return the cache lookup. const tokenizedWords = hasUnseenWords ? processUnseenTokens() : cacheLookup; const tokensTensor = this.tokenToIdMap.lookup(tokenizedWords.map(word => tensor(word.split(' ')))); const tokens = tokensTensor.map(t => Array.from(t.dataSync())); // Unflatten to match input. const newTokenRowSplits = [0]; for (const [idx, token] of tokens.entries()) { newTokenRowSplits.push(newTokenRowSplits[idx] + token.length); } const newFlatTokens = tokens.reduce((acc, e) => acc.concat(e), []); const gatheredIndices = tokenRowSplits.map(index => newTokenRowSplits[index]); let tokens2D = []; for (let i = 0; i < gatheredIndices.length - 1; i++) { const [start, end] = [gatheredIndices[i], gatheredIndices[i + 1]]; const row = newFlatTokens.slice(start, end); tokens2D.push(tensor(row)); } // Convert to a dense output if `sequenceLength` is set. if (this.sequenceLength) { // pad or truncate tokens2D = tokens2D.map(t => { if (t.size === this.sequenceLength) { return t; } else if (t.size > this.sequenceLength) { return t.slice(0, this.sequenceLength); } else { return t.pad([[0, this.sequenceLength - t.size]]); } }); } return tokens2D; }); } detokenize(inputs) { const unicodeText = this.idToTokenMap.lookup(inputs) .map(t => tensorToArr(t).join('')); return tensor(unicodeText); } } /** @nocollapse */ BytePairTokenizer.className = 'BytePairTokenizer'; export { BytePairTokenizer }; serialization.registerClass(BytePairTokenizer); //# sourceMappingURL=data:application/json;base64,{"version":3,"file":"tokenizers.js","sourceRoot":"","sources":["../../../../../../../tfjs-layers/src/layers/nlp/tokenizers.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;GAeG;AAEH;;GAEG;AAEH,6CAA6C;AAC7C,OAAO,EAAU,aAAa,EAAE,MAAM,EAAE,IAAI,EAAC,MAAM,uBAAuB,CAAC;AAE3E,OAAO,EAAE,KAAK,EAAa,MAAM,uBAAuB,CAAC;AACzD,OAAO,EAAE,mBAAmB,EAAE,UAAU,EAAE,MAAM,cAAc,CAAC;AAC/D,OAAO,EAAE,sBAAsB,EAAmB,cAAc,EAAE,qBAAqB,EAAE,uBAAuB,EAAE,kBAAkB,EAAE,MAAM,oBAAoB,CAAC;AACjK,OAAO,EAAE,WAAW,EAAE,gBAAgB,EAAE,MAAM,SAAS,CAAC;AAMxD;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAyCG;AACH,MAAM,OAAgB,SAAU,SAAQ,KAAK;IAS3C;;;;;OAKG;IACH,UAAU,CAAC,MAAgB;QACzB,MAAM,IAAI,mBAAmB,CAC3B;QACE,IAAI,CAAC,WAAW,CAAC,IAAI,GAAG,CAC3B,CAAC;IACJ,CAAC;IAED;;OAEG;IACH,IAAI,UAAU;QACZ,MAAM,IAAI,mBAAmB,CAC3B;QACE,IAAI,CAAC,WAAW,CAAC,IAAI,GAAG,CAC3B,CAAC;IACJ,CAAC;IAED;;OAEG;IACH,IAAI,cAAc;QAChB,MAAM,IAAI,mBAAmB,CAC3B;QACE,IAAI,CAAC,WAAW,CAAC,IAAI,GAAG,CAC3B,CAAC;IACJ,CAAC;IAED;;OAEG;IACH,SAAS,CAAC,EAAU;QAClB,MAAM,IAAI,mBAAmB,CAC3B;QACE,IAAI,CAAC,WAAW,CAAC,IAAI,GAAG,CAC3B,CAAC;IACJ,CAAC;IAED;;OAEG;IACH,SAAS,CAAC,KAAa;QACrB,MAAM,IAAI,mBAAmB,CAC3B;QACE,IAAI,CAAC,WAAW,CAAC,IAAI,GAAG,CAC3B,CAAC;IACJ,CAAC;IAEQ,IAAI,CACX,MAAuB,EACvB,EAAC,IAAI,GAAG,UAAU,KAAoB,EAAE;QAGxC,IAAI,IAAI,KAAK,UAAU,EAAE;YACvB,IAAI,MAAM,YAAY,KAAK,EAAE;gBAC3B,MAAM,IAAI,UAAU,CAAC,wCAAwC,CAAC,CAAC;aAChE;YACD,OAAO,IAAI,CAAC,QAAQ,CAAC,MAAM,CAAC,CAAC;SAC9B;QAED,IAAI,IAAI,KAAK,YAAY,EAAE;YACzB,IAAI,CAAC,CAAC,MAAM,YAAY,KAAK,CAAC,EAAE;gBAC9B,MAAM,IAAI,UAAU,CAAC,0CAA0C,CAAC,CAAC;aAClE;YACD,OAAO,IAAI,CAAC,UAAU,CAAC,MAAM,CAAC,CAAC;SAChC;QAED,MAAM,IAAI,UAAU,CAAC,cAAc,IAAI,oBAAoB,CAAC,CAAC;IAC/D,CAAC;CACF;AAwCD;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAiCG;AACH,MAAa,iBAAkB,SAAQ,SAAS;IAoB9C,YAAY,IAA2B;QACrC,KAAK,CAAC,IAAI,CAAC,CAAC;QATG,UAAK,GAAG,IAAI,sBAAsB,EAAE,CAAC;QAWpD,IAAI,CAAC,WAAW,GAAG,IAAI,GAAG,CAAC,IAAI,CAAC,UAAU,CAAC,CAAC;QAC5C,IAAI,CAAC,MAAM,GAAG,CAAC,GAAG,IAAI,CAAC,MAAM,CAAC,CAAC;QAE/B,IAAI,CAAC,cAAc,GAAG,IAAI,CAAC,cAAc,IAAI,IAAI,CAAC;QAClD,IAAI,CAAC,cAAc,GAAG,IAAI,CAAC,cAAc,IAAI,KAAK,CAAC;QACnD,IAAI,CAAC,kBAAkB,GAAG,IAAI,CAAC,kBAAkB,IAAI,IAAI,CAAC;QAE1D,+DAA+D;QAC/D,qBAAqB;QACrB,MAAM,CAAC,QAAQ,EAAE,WAAW,CAAC,GAAG,cAAc,EAAE,CAAC;QACjD,IAAI,CAAC,YAAY,GAAG,qBAAqB,CACvC,KAAK,CAAC,IAAI,CAAC,QAAQ,CAAC,EAAE,WAAW,EAAE,EAAE,CAAC,CAAC;QAEzC,IAAI,IAAI,CAAC,kBAAkB,EAAE;YAC3B,uEAAuE;YACvE,UAAU;YACV,IAAI,CAAC,KAAK,CAAC,MAAM,CAAC,IAAI,CAAC,kBAAkB,EAAE,IAAI,CAAC,kBAAkB,CAAC,CAAC;SACrE;QAED,mEAAmE;QACnE,MAAM,SAAS,GAAG,CAAC,GAAG,IAAI,CAAC,WAAW,CAAC,IAAI,EAAE,CAAC,CAAC;QAC/C,MAAM,wBAAwB,GAAG,CAAC,GAAG,IAAI,CAAC,WAAW,CAAC,MAAM,EAAE,CAAC,CAAC;QAEhE,IAAI,CAAC,YAAY,GAAG,qBAAqB,CACvC,SAAS,EAAE,wBAAwB,EAAE,CAAC,CAAC,CAAC,CAAC;QAE3C,IAAI,CAAC,YAAY,GAAG,qBAAqB,CACvC,wBAAwB,EAAE,SAAS,EAAE,EAAE,CAAC,CAAC;QAE3C,0EAA0E;QAC1E,oBAAoB;QACpB,IAAI,CAAC,uBAAuB,GAAG,IAAI,CAAC,MAAM,CAAC,MAAM,GAAG,CAAC,CAAC;QACtD,IAAI,CAAC,UAAU,GAAG,qBAAqB,CACrC,IAAI,CAAC,MAAM,EACX,CAAC,GAAG,KAAK,CAAC,IAAI,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,IAAI,EAAE,CAAC,EACrC,IAAI,CAAC,uBAAuB,CAC7B,CAAC;IACJ,CAAC;IAED;;OAEG;IACH,IAAa,UAAU;QACrB,OAAO,CAAC,GAAG,IAAI,CAAC,WAAW,CAAC,IAAI,EAAE,CAAC,CAAC;IACtC,CAAC;IAED;;OAEG;IACH,IAAa,cAAc;QACzB,OAAO,IAAI,CAAC,WAAW,CAAC,IAAI,CAAC;IAC/B,CAAC;IAED;;OAEG;IACM,SAAS,CAAC,EAAU;QAC3B,uEAAuE;QACvE,sEAAsE;QACtE,2CAA2C;QAC3C,MAAM,IAAI,GAAG,IAAI,CAAC,UAAU,CAAC;QAC7B,KAAK,MAAM,KAAK,IAAI,IAAI,EAAE;YACxB,IAAI,IAAI,CAAC,WAAW,CAAC,GAAG,CAAC,KAAK,CAAC,KAAK,EAAE,EAAE;gBACtC,OAAO,KAAK,CAAC;aACd;SACF;QACD,OAAO,SAAS,CAAC;IACnB,CAAC;IAED;;OAEG;IACM,SAAS,CAAC,KAAa;QAC9B,OAAO,IAAI,CAAC,WAAW,CAAC,GAAG,CAAC,KAAK,CAAC,CAAC;IACrC,CAAC;IAEQ,SAAS;QAChB,MAAM,MAAM,GAAG;YACb,UAAU,EAAE,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,WAAW,CAAC,OAAO,EAAE,CAAC;YAClD,MAAM,EAAE,IAAI,CAAC,MAAM;YACnB,cAAc,EAAE,IAAI,CAAC,cAAc;YACnC,cAAc,EAAE,IAAI,CAAC,cAAc;YACnC,kBAAkB,EAAE,IAAI,CAAC,kBAAkB;SAC5C,CAAC;QACF,MAAM,UAAU,GAAG,KAAK,CAAC,SAAS,EAAE,CAAC;QACrC,MAAM,CAAC,MAAM,CAAC,MAAM,EAAE,UAAU,CAAC,CAAC;QAClC,OAAO,MAAM,CAAC;IAChB,CAAC;IAED;;OAEG;IACK,eAAe,CACrB,KAAe,EAAE,IAAe;QAEhC,MAAM,QAAQ,GAAG,gBAAgB,CAAC,KAAK,CAAe,CAAC;QAEvD,sBAAsB;QACtB,MAAM,KAAK,GAAG,QAAQ,CAAC,GAAG,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC;QACpD,MAAM,MAAM,GAAG,QAAQ,CAAC,GAAG,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,CAAC,KAAK,CAAC,CAAC,EAAE,GAAG,CAAC,MAAM,CAAC,CAAC,CAAC;QAE7D,cAAc;QACd,MAAM,YAAY,GAAG,MAAM,CAAC,GAAG,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC;QACvD,IAAI,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,GAAG,EAAE,EAAE,CAAC,CAAC,IAAI,YAAY,CAAC,GAAG,CAAC,CAAC,CAAC;QACpD,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,EAAE;YACtB,OAAO,CAAC,KAAK,EAAE,IAAI,CAAC,CAAC;SACtB;QACD,MAAM,eAAe,GAAG,IAAI;aACzB,GAAG,CAAC,CAAC,IAAI,EAAE,GAAG,EAAE,EAAE,CAAC,IAAI,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;aACnC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC;QAEzB,MAAM,aAAa,GAAG,eAAe,CAAC,GAAG,CAAC,GAAG,CAAC,EAAE,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC;QAC7D,MAAM,cAAc,GAAG,eAAe,CAAC,GAAG,CAAC,GAAG,CAAC,EAAE,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC;QAE/D,wCAAwC;QACxC,MAAM,KAAK,GAAe,aAAa,CAAC,GAAG,CAAC,CAAC,WAAW,EAAE,GAAG,EAAE,EAAE;YAC/D,MAAM,YAAY,GAAG,cAAc,CAAC,GAAG,CAAC,CAAC;YAEzC,OAAO,WAAW,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,GAAG,EAAE,EAAE,CAAC,GAAG,IAAI,IAAI,YAAY,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC;QACxE,CAAC,CAAC,CAAC;QACH,MAAM,eAAe,GAAG,IAAI,CAAC,UAAU,CAAC,MAAM,CAC5C,KAAK,CAAC,GAAG,CAAC,GAAG,CAAC,EAAE,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC;QACjC,MAAM,SAAS,GAAG,gBAAgB,CAAC,eAAe,CAAe,CAAC;QAElE,sBAAsB;QACtB,MAAM,WAAW,GAAG,SAAS,CAAC,GAAG,CAC/B,GAAG,CAAC,EAAE,CAAC,GAAG,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,CAAC,CAAC,EAAE,QAAQ,CAAC,CAAC,CAAC;QACzD,MAAM,aAAa,GAAG,WAAW,CAAC,GAAG,CACnC,IAAI,CAAC,EAAE,CAAC,IAAI,KAAK,IAAI,CAAC,uBAAuB,CAAC,CAAC;QAEjD,+DAA+D;QAC/D,KAAK,MAAM,CAAC,GAAG,EAAE,KAAK,CAAC,IAAI,eAAe,CAAC,OAAO,EAAE,EAAE;YACpD,MAAM,MAAM,GAAG,aAAa,CAAC,GAAG,CAAC,CAAC;YAClC,IAAI,CAAC,KAAK,CAAC,GAAG,MAAM,CAAC;SACtB;QACD,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,EAAE;YACtB,OAAO,CAAC,KAAK,EAAE,IAAI,CAAC,CAAC;SACtB;QAED,SAAS,MAAM,CAAC,GAAa;YAC3B,OAAO,GAAG,CAAC,OAAO,CAAC,GAAG,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,CAAC,CAAC,EAAE,QAAQ,CAAC,CAAC,CAAC;QACrE,CAAC;QAED,MAAM,eAAe,GAAG,SAAS,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,GAAG,EAAE,EAAE,CAAC,aAAa,CAAC,GAAG,CAAC,CAAC,CAAC;QACzE,MAAM,kBAAkB,GAAG,eAAe,CAAC,GAAG,CAAC,GAAG,CAAC,EAAE,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC;QAEnE,kCAAkC;QAClC,MAAM,eAAe,GAAG,QAAQ,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,GAAG,EAAE,EAAE,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC;QAE/D,MAAM,QAAQ,GAAG,eAAe,CAAC,GAAG,CAClC,CAAC,IAAI,EAAE,GAAG,EAAE,EAAE,CAAC,IAAI,CAAC,kBAAkB,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC;QAEhD,MAAM,SAAS,GAAG,eAAe,CAAC,GAAG,CACnC,CAAC,IAAI,EAAE,GAAG,EAAE,EAAE,CAAC,IAAI,CAAC,kBAAkB,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC;QAEpD,MAAM,WAAW,GAAG,QAAQ,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,GAAG,EAAE,EAAE;YAC7C,MAAM,KAAK,GAAG,SAAS,CAAC,GAAG,CAAC,CAAC;YAC7B,OAAO,GAAG,IAAI,GAAG,KAAK,EAAE,CAAC;QAC3B,CAAC,CAAC,CAAC;QACH,MAAM,sBAAsB,GAAG,IAAI;aAChC,GAAG,CAAC,CAAC,CAAC,EAAE,GAAG,EAAE,EAAE,CAAC,GAAG,CAAC;aACpB,MAAM,CAAC,CAAC,CAAC,EAAE,GAAG,EAAE,EAAE,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC;QAEjC,MAAM,iBAAiB,GAAG,sBAAsB,CAAC,GAAG,CAClD,CAAC,KAAK,EAAE,GAAG,EAAE,EAAE,CAAC,CAAC,KAAK,EAAE,kBAAkB,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC;QACpD,MAAM,kBAAkB,GAAG,sBAAsB,CAAC,GAAG,CACnD,CAAC,KAAK,EAAE,GAAG,EAAE,EAAE,CAAC,CAAC,KAAK,EAAE,kBAAkB,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC;QAExD,KAAK,MAAM,CAAC,GAAG,EAAE,OAAO,CAAC,IAAI,iBAAiB,CAAC,OAAO,EAAE,EAAE;YACxD,MAAM,CAAC,OAAO,EAAE,OAAO,CAAC,GAAG,OAAO,CAAC;YACnC,MAAM,UAAU,GAAG,WAAW,CAAC,GAAG,CAAC,CAAC;YACpC,QAAQ,CAAC,OAAO,CAAC,CAAC,OAAO,CAAC,GAAG,UAAU,CAAC;SACzC;QAED,KAAK,MAAM,OAAO,IAAI,kBAAkB,EAAE;YACxC,MAAM,CAAC,OAAO,EAAE,OAAO,CAAC,GAAG,OAAO,CAAC;YACnC,QAAQ,CAAC,OAAO,CAAC,CAAC,OAAO,CAAC,GAAG,EAAE,CAAC;SACjC;QAED,KAAK,GAAG,QAAQ,CAAC,GAAG,CAAC,IAAI,CAAC,EAAE,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC,CAAC;QAC3C,KAAK,GAAG,uBAAuB,CAAC,KAAK,EAAE,EAAE,CAAC,CAAC;QAE3C,OAAO,CAAC,KAAK,EAAE,IAAI,CAAC,CAAC;IACvB,CAAC;IAED;;OAEG;IACK,QAAQ,CAAC,KAAe;QAC9B,MAAM,QAAQ,GAAG,KAAK,CAAC,MAAM,CAAC;QAE9B,eAAe;QACf,SAAS,aAAa,CAAC,IAAe;YACpC,OAAO,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC;QAC3B,CAAC;QAED,MAAM,WAAW,GAAc,KAAK,CAAC,QAAQ,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QAE1D,IAAI,WAAW,GAAG,KAAK,CAAC;QACxB,IAAI,IAAI,GAAG,WAAW,CAAC;QACvB,OAAO,aAAa,CAAC,IAAI,CAAC,EAAE;YAC1B,CAAC,WAAW,EAAE,IAAI,CAAC,GAAG,IAAI,CAAC,eAAe,CAAC,WAAW,EAAE,IAAI,CAAC,CAAC;SAC/D;QAED,OAAO,WAAW,CAAC;IACrB,CAAC;IAED;;OAEG;IACK,cAAc,CAAC,MAAc;QACnC,MAAM,SAAS,GAAG,WAAW,CAAC,MAAM,CAAa,CAAC;QAElD,MAAM,UAAU,GAAG,SAAS,CAAC,GAAG,CAC9B,KAAK,CAAC,EAAE,CAAC,MAAM,CAAC,KAAK,CAAC,KAAK,CAAC,EAAE,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,UAAU,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;QAC9D,MAAM,YAAY,GAAG,IAAI,CAAC,YAAY,CAAC,MAAM,CAAC,UAAU,CAAC,CAAC;QAE1D,OAAO,YAAY,CAAC;IACtB,CAAC;IAED;;OAEG;IACK,sBAAsB,CAAC,MAAc;QAC3C,MAAM,KAAK,GAAG,IAAI,CAAC,cAAc,CAAC,MAAM,CAAC,CAAC;QAC1C,MAAM,oBAAoB,GAAG,IAAI,CAAC,QAAQ,CAAC,KAAK,CAAC,CAAC;QAClD,MAAM,cAAc,GAAG,gBAAgB,CAAC,oBAAoB,CAAe,CAAC;QAE5E,qDAAqD;QACrD,4DAA4D;QAC5D,MAAM,YAAY,GAAG,cAAc,CAAC,GAAG,CAAC,IAAI,CAAC,EAAE,CAAC,IAAI,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC;QAEhE,IAAI,CAAC,KAAK,CAAC,MAAM,CAAC,MAAM,EAAE,YAAY,CAAC,CAAC;IAC1C,CAAC;IAED,QAAQ,CAAC,MAAc;QACrB,OAAO,IAAI,CAAC,GAAG,EAAE;YACf,IAAI,IAAI,CAAC,cAAc,EAAE;gBACvB,MAAM,SAAS,GAAG,WAAW,CAAC,MAAM,CAAa,CAAC;gBAClD,MAAM,GAAG,MAAM,CAAC,SAAS,CAAC,GAAG,CAAC,IAAI,CAAC,EAAE,CAAC,GAAG,GAAG,IAAI,CAAC,CAAC,CAAC;aACpD;YAED,MAAM,eAAe,GACnB,kBAAkB,CAAC,MAAM,EAAE,IAAI,CAAC,kBAAkB,CAAC,CAAC;YACtD,MAAM,SAAS,GAAG,gBAAgB,CAAC,eAAe,CAAe,CAAC;YAElE,MAAM,cAAc,GAAG,CAAC,CAAC,CAAC,CAAC;YAC3B,KAAK,MAAM,CAAC,GAAG,EAAE,KAAK,CAAC,IAAI,SAAS,CAAC,OAAO,EAAE,EAAE;gBAC9C,cAAc,CAAC,IAAI,CAAC,cAAc,CAAC,GAAG,CAAC,GAAG,KAAK,CAAC,MAAM,CAAC,CAAC;aACzD;YAED,MAAM,UAAU,GAAG,SAAS,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC,GAAG,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;YAEnE,eAAe;YACf,MAAM,WAAW,GAAG,IAAI,CAAC,KAAK,CAAC,MAAM,CAAC,UAAU,CAAC,CAAC;YAClD,MAAM,SAAS,GAAG,WAAW,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,KAAK,EAAE,CAAC,CAAC;YAEjD,MAAM,cAAc,GAAG,SAAS,CAAC,IAAI,CACnC,CAAC,IAAI,EAAE,GAAG,EAAE,EAAE,CAAC,IAAI,IAAI,UAAU,CAAC,GAAG,CAAC,KAAK,EAAE,CAAC,CAAC;YAEjD,MAAM,mBAAmB,GAAG,GAAc,EAAE;gBAC1C,MAAM,YAAY,GAAG,UAAU,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,GAAG,EAAE,EAAE,CAAC,SAAS,CAAC,GAAG,CAAC,CAAC,CAAC;gBACnE,IAAI,CAAC,sBAAsB,CAAC,MAAM,CAAC,YAAY,CAAC,CAAC,CAAC;gBAClD,OAAO,IAAI,CAAC,KAAK,CAAC,MAAM,CAAC,UAAU,CAAC,CAAC;YACvC,CAAC,CAAC;YAEF,uEAAuE;YACvE,wEAAwE;YACxE,MAAM,cAAc,GAClB,cAAc,CAAC,CAAC,CAAC,mBAAmB,EAAE,CAAC,CAAC,CAAC,WAAW,CAAC;YAEvD,MAAM,YAAY,GAAG,IAAI,CAAC,YAAY,CAAC,MAAM,CAC3C,cAAc,CAAC,GAAG,CAAC,IAAI,CAAC,EAAE,CAAC,MAAM,CAAC,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC;YACvD,MAAM,MAAM,GAAG,YAAY,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,CAAC,QAAQ,EAAE,CAAC,CAAC,CAAC;YAE/D,4BAA4B;YAC5B,MAAM,iBAAiB,GAAG,CAAC,CAAC,CAAC,CAAC;YAC9B,KAAK,MAAM,CAAC,GAAG,EAAE,KAAK,CAAC,IAAI,MAAM,CAAC,OAAO,EAAE,EAAE;gBAC3C,iBAAiB,CAAC,IAAI,CAAC,iBAAiB,CAAC,GAAG,CAAC,GAAG,KAAK,CAAC,MAAM,CAAC,CAAC;aAC/D;YACD,MAAM,aAAa,GAAG,MAAM,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC,GAAG,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;YACnE,MAAM,eAAe,GACnB,cAAc,CAAC,GAAG,CAAC,KAAK,CAAC,EAAE,CAAC,iBAAiB,CAAC,KAAK,CAAC,CAAC,CAAC;YAExD,IAAI,QAAQ,GAAa,EAAE,CAAC;YAC5B,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,eAAe,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC,EAAE,EAAE;gBACnD,MAAM,CAAC,KAAK,EAAE,GAAG,CAAC,GAAG,CAAC,eAAe,CAAC,CAAC,CAAC,EAAE,eAAe,CAAC,CAAC,GAAC,CAAC,CAAC,CAAC,CAAC;gBAChE,MAAM,GAAG,GAAG,aAAa,CAAC,KAAK,CAAC,KAAK,EAAE,GAAG,CAAC,CAAC;gBAC5C,QAAQ,CAAC,IAAI,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC;aAC5B;YAED,wDAAwD;YACxD,IAAI,IAAI,CAAC,cAAc,EAAE;gBACvB,kBAAkB;gBAClB,QAAQ,GAAG,QAAQ,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE;oBAC1B,IAAI,CAAC,CAAC,IAAI,KAAK,IAAI,CAAC,cAAc,EAAE;wBAClC,OAAO,CAAC,CAAC;qBACV;yBAAM,IAAI,CAAC,CAAC,IAAI,GAAG,IAAI,CAAC,cAAc,EAAE;wBACvC,OAAO,CAAC,CAAC,KAAK,CAAC,CAAC,EAAE,IAAI,CAAC,cAAc,CAAC,CAAC;qBACxC;yBAAM;wBACL,OAAO,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,EAAE,IAAI,CAAC,cAAc,GAAG,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;qBACnD;gBACH,CAAC,CAAC,CAAC;aACJ;YAED,OAAO,QAAQ,CAAC;QAClB,CAAC,CAAC,CAAC;IACL,CAAC;IAEQ,UAAU,CAAC,MAAgB;QAClC,MAAM,WAAW,GAAG,IAAI,CAAC,YAAY,CAAC,MAAM,CAAC,MAAM,CAAC;aACjD,GAAG,CAAC,CAAC,CAAC,EAAE,CAAE,WAAW,CAAC,CAAC,CAAc,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC,CAAC;QAEnD,OAAO,MAAM,CAAC,WAAW,CAAC,CAAC;IAC7B,CAAC;;AAhVD,kBAAkB;AACF,2BAAS,GAAG,mBAAmB,AAAtB,CAAuB;SAFrC,iBAAiB;AAmV9B,aAAa,CAAC,aAAa,CAAC,iBAAiB,CAAC,CAAC","sourcesContent":["/**\n * @license\n * Copyright 2023 Google LLC.\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may not use this file except in compliance with the License.\n * You may obtain a copy of the License at\n *\n * http://www.apache.org/licenses/LICENSE-2.0\n *\n * Unless required by applicable law or agreed to in writing, software\n * distributed under the License is distributed on an \"AS IS\" BASIS,\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n * See the License for the specific language governing permissions and\n * limitations under the License.\n * =============================================================================\n */\n\n/**\n *  Tokenizer layers.\n */\n\n/* Original source: keras-nlp/tokenizer.py */\nimport { Tensor, serialization, tensor, tidy} from '@tensorflow/tfjs-core';\n\nimport { Layer, LayerArgs } from '../../engine/topology';\nimport { NotImplementedError, ValueError } from '../../errors';\nimport { BytePairTokenizerCache, StaticHashTable, bytesToUnicode, createStaticHashtable, removeStringsFromInputs, splitStringsForBpe } from './tokenizers_utils';\nimport { tensorToArr, tensorArrTo2DArr } from './utils';\n\nexport declare interface TokenizerOptions {\n  mode?: 'tokenize' | 'detokenize';\n}\n\n/**\n * Base class for Tokenizers.\n *\n *  Tokenizers in the tfjs library should all subclass this layer.\n *  The class provides two core methods `tokenize()` and `detokenize()` for\n *  going from plain text to sequences and back. A tokenizer is a subclass of\n *  `Layer` and can be combined with other layers in a `tf.sequential` model.\n *\n *  Subclassers should always implement the `tokenize()` method, which will also\n *  be the default when calling the layer directly on inputs.\n *\n *  Subclassers can optionally implement the `detokenize()` method if the\n *  tokenization is reversible. Otherwise, this can be skipped.\n *\n *  Subclassers should implement `get_vocabulary()`, `vocabulary_size()`,\n *  `token_to_id()` and `id_to_token()` if applicable. For some simple\n *  \"vocab free\" tokenizers, such as a whitespace splitter shown below, these\n *  methods do not apply and can be skipped.\n *\n *  Example:\n *\n *  ```js\n *  class WhitespaceSplitterTokenizer extends Tokenizer {\n *    tokenize(inputs: Tensor): Tensor[] {\n *      const stringInputs = inputs.dataSync() as unknown as string[];\n *      return stringInputs.map(input => Tensor(input.split(' ')));\n *    }\n *\n *    override detokenize(inputs: Tensor[]): Tensor {\n *      const stringInputs = inputs.map(\n *        input => input.dataSync() as unknown as string[]);\n *      return Tensor(stringInputs.map(str => str.join(' ')));\n *    }\n *  }\n *\n * const tokenizer = new WhitespaceSplitterTokenizer();\n *\n * tokenizer.tokenize(tensor(['this is a test']))[0].print();\n *\n * tokenizer.detokenize([tensor(['this', 'is', 'a', 'test'])]).print();\n * ```\n */\nexport abstract class Tokenizer extends Layer {\n  /**\n   * Transform input tensors of strings into output tokens.\n   *\n   * @param inputs Input tensor.\n   * @param kwargs Additional keyword arguments.\n   */\n  abstract tokenize(inputs: Tensor): Tensor[];\n\n  /**\n   * Transform tokens back into strings.\n   *\n   * @param inputs Input tensor.\n   * @param kwargs Additional keyword arguments.\n   */\n  detokenize(inputs: Tensor[]): Tensor {\n    throw new NotImplementedError(\n      `No implementation of 'detokenize()' was found for\n      ${this.constructor.name}.`\n    );\n  }\n\n  /**\n   * Get the tokenizer vocabulary as a list of strings terms.\n   */\n  get vocabulary(): string[] {\n    throw new NotImplementedError(\n      `No implementation of 'vocabulary()' was found for\n      ${this.constructor.name}.`\n    );\n  }\n\n  /**\n   * Returns the total size of the token id space.\n   */\n  get vocabularySize(): number {\n    throw new NotImplementedError(\n      `No implementation of 'vocabularySize()' was found for\n      ${this.constructor.name}.`\n    );\n  }\n\n  /**\n   * Convert an integer id to a string token.\n   */\n  idToToken(id: number): string {\n    throw new NotImplementedError(\n      `No implementation of 'idToToken()' was found for\n      ${this.constructor.name}.`\n    );\n  }\n\n  /**\n   * Convert an integer id to a string token.\n   */\n  tokenToId(token: string): number {\n    throw new NotImplementedError(\n      `No implementation of 'tokenToId()' was found for\n      ${this.constructor.name}.`\n    );\n  }\n\n  override call(\n    inputs: Tensor|Tensor[],\n    {mode = 'tokenize'}: TokenizerOptions={}\n  ): Tensor|Tensor[] {\n\n    if (mode === 'tokenize') {\n      if (inputs instanceof Array) {\n        throw new ValueError(`tokenize expects Tensor, not Tensor[].`);\n      }\n      return this.tokenize(inputs);\n    }\n\n    if (mode === 'detokenize') {\n      if (!(inputs instanceof Array)) {\n        throw new ValueError(`detokenize expects Tensor[], not Tensor.`);\n      }\n      return this.detokenize(inputs);\n    }\n\n    throw new ValueError(`Input mode=${mode} is not supported.`);\n  }\n}\n\n/* Original source: keras-nlp/byte_pair_tokenizer.py */\n// TODO(pforderique): Support filename string inputs for vocabulary and merges.\nexport declare interface BytePairTokenizerArgs extends LayerArgs {\n  /**\n   * Maps token to integer ids\n   */\n  vocabulary: Map<string, number>;\n\n  /**\n   * Array. Contains the merge rule.\n   */\n  merges: string[];\n\n  /**\n   * Integer. If set, the output will be padded or truncated to the\n   * `sequenceLength`. Defaults to `null`.\n   */\n  sequenceLength?: number;\n\n  /**\n   * Boolean. Whether to add an initial space to the input. This tokenizer is\n   * whitespace aware, and will tokenize a word with a leading space\n   * differently. Adding a prefix space to the first word will cause it to be\n   * tokenized equivalently to all subsequent words in the sequence.\n   * Defaults to `false`.\n   */\n  addPrefixSpace?: boolean;\n\n  /**\n   * Array. A list of strings that will never be split during the word-level\n   * splitting applied before the byte-pair encoding. This can be used to ensure\n   * special tokens map to unique indices in the vocabulary, even if these\n   * special tokens contain splittable characters such as punctuation. Special\n   * tokens must still be included in `vocabulary`. Defaults to `None`.\n   */\n  unsplittableTokens?: string[];\n}\n\n/**\n * Byte-pair encoding tokenizer layer.\n *\n * This BPE tokenizer provides the same functionality as the official GPT-2\n * tokenizer. Given the same `vocabulary` which maps tokens to ids, and `merges`\n * which describes BPE merge rules, it should provide the same output as OpenAI\n * implementation (https://github.com/openai/gpt-2/blob/master/src/encoder.py).\n *\n * If input is a batch of strings (rank > 0):\n * By default, the layer will output a `Tensor[]`.\n * If `sequenceLength` is set, the layer will output a `Tensor[]` where all\n * inputs have been padded or truncated to `sequenceLength`.\n *\n * Examples:\n *\n * Tokenize\n * ```js\n * const vocabulary = new Map([['butter', 1], ['fly', 2]]);\n * const merges = ['b u', 't t', 'e r', 'bu tt', 'butt er', 'f l', 'fl y'];\n * const tokenizer = new BytePairTokenizer({vocabulary, merges});\n *\n * tokenizer.tokenize(tensor(['butterfly']))[0].print();\n * tokenizer.tokenize(tensor(['butterfly, butter']))[1].print();\n * ```\n *\n * Detokenize\n * ```js\n * const vocabulary = new Map([['butter', 1], ['fly', 2]]);\n * const merges = ['b u', 't t', 'e r', 'bu tt', 'butt er', 'f l', 'fl y'];\n * const tokenizer = new BytePairTokenizer({vocabulary, merges});\n *\n * tokenizer.detokenize([[1, 2]]).print();\n * ```\n */\nexport class BytePairTokenizer extends Tokenizer {\n  /** @nocollapse */\n  static readonly className = 'BytePairTokenizer';\n\n  private _vocabulary: Map<string, number>;\n  private merges: string[];\n\n  private readonly sequenceLength: number;\n  private readonly addPrefixSpace: boolean;\n  private readonly unsplittableTokens: string[];\n\n  private readonly byte2Unicode: StaticHashTable<number, string>;\n  private readonly cache = new BytePairTokenizerCache();\n\n  private readonly tokenToIdMap: StaticHashTable<string, number>;\n  private readonly idToTokenMap: StaticHashTable<number, string>;\n\n  private readonly mergeRanksLookupDefault: number;\n  private readonly mergeRanks: StaticHashTable<string, number>;\n\n  constructor(args: BytePairTokenizerArgs) {\n    super(args);\n\n    this._vocabulary = new Map(args.vocabulary);\n    this.merges = [...args.merges];\n\n    this.sequenceLength = args.sequenceLength || null;\n    this.addPrefixSpace = args.addPrefixSpace || false;\n    this.unsplittableTokens = args.unsplittableTokens || null;\n\n    // Create byte <=> unicode mapping. This is useful for handling\n    // whitespace tokens.\n    const [byteList, unicodeList] = bytesToUnicode();\n    this.byte2Unicode = createStaticHashtable(\n      Array.from(byteList), unicodeList, '');\n\n    if (this.unsplittableTokens) {\n      // Put unsplittable tokens into cache, so it won't be further split and\n      // merged.\n      this.cache.insert(this.unsplittableTokens, this.unsplittableTokens);\n    }\n\n    // Create mapping between string tokens to int ids, and vice versa.\n    const bytePairs = [...this._vocabulary.keys()];\n    const bytePairEncodingIndicies = [...this._vocabulary.values()];\n\n    this.tokenToIdMap = createStaticHashtable(\n      bytePairs, bytePairEncodingIndicies, -1);\n\n    this.idToTokenMap = createStaticHashtable(\n      bytePairEncodingIndicies, bytePairs, '');\n\n    // Create ranking of merge rules, this is the same as order of merge pairs\n    // in `this.merges`.\n    this.mergeRanksLookupDefault = this.merges.length + 1;\n    this.mergeRanks = createStaticHashtable(\n      this.merges,\n      [...Array(this.merges.length).keys()],\n      this.mergeRanksLookupDefault\n    );\n  }\n\n  /**\n   * Get the tokenizer vocabulary as a list of string tokens.\n   */\n  override get vocabulary(): string[] {\n    return [...this._vocabulary.keys()];\n  }\n\n  /**\n   * Get the size of the tokenizer vocabulary.\n   */\n  override get vocabularySize(): number {\n    return this._vocabulary.size;\n  }\n\n  /**\n   * Convert an integer id to a string token.\n   */\n  override idToToken(id: number): string | undefined {\n    // This will be slow, but keep memory usage down compared to building a\n    // dict. Assuming the main use case is looking up a few special tokens\n    // early in the vocab, this should be fine.\n    const keys = this.vocabulary;\n    for (const token of keys) {\n      if (this._vocabulary.get(token) === id) {\n        return token;\n      }\n    }\n    return undefined;\n  }\n\n  /**\n   * Convert a string token to an integer id.\n   */\n  override tokenToId(token: string): number | undefined {\n    return this._vocabulary.get(token);\n  }\n\n  override getConfig(): serialization.ConfigDict {\n    const config = {\n      vocabulary: Array.from(this._vocabulary.entries()),\n      merges: this.merges,\n      sequenceLength: this.sequenceLength,\n      addPrefixSpace: this.addPrefixSpace,\n      unsplittableTokens: this.unsplittableTokens,\n    };\n    const baseConfig = super.getConfig();\n    Object.assign(config, baseConfig);\n    return config;\n  }\n\n  /**\n   * Perform one step of byte-pair merge.\n   */\n  private bpeMergeOneStep(\n    words: Tensor[], mask: boolean[]): [Tensor[], boolean[]] {\n\n    const wordsStr = tensorArrTo2DArr(words) as string[][];\n\n    // Get all word pairs.\n    const first = wordsStr.map(arr => arr.slice(0, -1));\n    const second = wordsStr.map(arr => arr.slice(1, arr.length));\n\n    // Mask empty.\n    const nonEmptyMask = second.map(arr => arr.length > 0);\n    mask = mask.map((a, idx) => a && nonEmptyMask[idx]);\n    if (!mask.some(e => e)) {\n      return [words, mask];\n    }\n    const nonEmptyIndices = mask\n      .map((bool, idx) => bool ? idx : -1)\n      .filter(e => e !== -1);\n\n    const filteredFirst = nonEmptyIndices.map(idx => first[idx]);\n    const filteredSecond = nonEmptyIndices.map(idx => second[idx]);\n\n    // Get byte pair ranking in merge rules.\n    const pairs: string[][] = filteredFirst.map((firstSubArr, idx) => {\n      const secondSubArr = filteredSecond[idx];\n\n      return firstSubArr.map((char, idx) => `${char} ${secondSubArr[idx]}`);\n    });\n    const pairRanksTensor = this.mergeRanks.lookup(\n      pairs.map(arr => tensor(arr)));\n    const pairRanks = tensorArrTo2DArr(pairRanksTensor) as number[][];\n\n    // Get BPE pair ranks.\n    const minPairRank = pairRanks.map(\n      arr => arr.reduce((a, b) => Math.min(a, b), Infinity));\n    const pairFoundMask = minPairRank.map(\n      rank => rank !== this.mergeRanksLookupDefault);\n\n    // Tokens that cannot be further merged are marked as finished.\n    for (const [idx, index] of nonEmptyIndices.entries()) {\n      const update = pairFoundMask[idx];\n      mask[index] = update;\n    }\n    if (!mask.some(e => e)) {\n      return [words, mask];\n    }\n\n    function argMin(arr: number[]): number {\n      return arr.indexOf(arr.reduce((a, b) => Math.min(a, b), Infinity));\n    }\n\n    const maskedPairRanks = pairRanks.filter((_, idx) => pairFoundMask[idx]);\n    const minPairRankIndices = maskedPairRanks.map(arr => argMin(arr));\n\n    // Get words and pairs to process.\n    const unfinishedWords = wordsStr.filter((_, idx) => mask[idx]);\n\n    const pairLeft = unfinishedWords.map(\n      (word, idx) => word[minPairRankIndices[idx]]);\n\n    const pairRight = unfinishedWords.map(\n      (word, idx) => word[minPairRankIndices[idx] + 1]);\n\n    const mergedPairs = pairLeft.map((left, idx) => {\n      const right = pairRight[idx];\n      return `${left}${right}`;\n    });\n    const unfinishedWordsIndices = mask\n      .map((_, idx) => idx)\n      .filter((_, idx) => mask[idx]);\n\n    const mergedPairIndices = unfinishedWordsIndices.map(\n      (index, idx) => [index, minPairRankIndices[idx]]);\n    const emptyStringIndices = unfinishedWordsIndices.map(\n      (index, idx) => [index, minPairRankIndices[idx] + 1]);\n\n    for (const [idx, indices] of mergedPairIndices.entries()) {\n      const [wordIdx, charIdx] = indices;\n      const mergedPair = mergedPairs[idx];\n      wordsStr[wordIdx][charIdx] = mergedPair;\n    }\n\n    for (const indices of emptyStringIndices) {\n      const [wordIdx, charIdx] = indices;\n      wordsStr[wordIdx][charIdx] = '';\n    }\n\n    words = wordsStr.map(word => tensor(word));\n    words = removeStringsFromInputs(words, '');\n\n    return [words, mask];\n  }\n\n  /**\n   * Perform byte-pair merge for each word in the inputs.\n   */\n  private bpeMerge(words: Tensor[]): Tensor[] {\n    const numWords = words.length;\n\n    // Merge bytes.\n    function loopCondition(mask: boolean[]): boolean {\n      return mask.some(e => e);\n    }\n\n    const initialMask: boolean[] = Array(numWords).fill(true);\n\n    let mergedWords = words;\n    let mask = initialMask;\n    while (loopCondition(mask)) {\n      [mergedWords, mask] = this.bpeMergeOneStep(mergedWords, mask);\n    }\n\n    return mergedWords;\n  }\n\n  /**\n   * Map token bytes to unicode using `byte2unicode`.\n   */\n  private transformBytes(tokens: Tensor): Tensor[] {\n    const tokensStr = tensorToArr(tokens) as string[];\n\n    const splitBytes = tokensStr.map(\n      token => tensor(token.split('').map(c => c.charCodeAt(0))));\n    const splitUnicode = this.byte2Unicode.lookup(splitBytes);\n\n    return splitUnicode;\n  }\n\n  /**\n   * Process unseen tokens and add to cache.\n   */\n  private bpeMergeAndUpdateCache(tokens: Tensor) {\n    const words = this.transformBytes(tokens);\n    const tokenizedWordsTensor = this.bpeMerge(words);\n    const tokenizedWords = tensorArrTo2DArr(tokenizedWordsTensor) as string[][];\n\n    // For each word, join all its token by a whitespace,\n    // e.g., [\"dragon\", \"fly\"] => \"dragon fly\" for hash purpose.\n    const joinedTokens = tokenizedWords.map(word => word.join(' '));\n\n    this.cache.insert(tokens, joinedTokens);\n  }\n\n  tokenize(inputs: Tensor): Tensor[] {\n    return tidy(() => {\n      if (this.addPrefixSpace) {\n        const strInputs = tensorToArr(inputs) as string[];\n        inputs = tensor(strInputs.map(word => ' ' + word));\n      }\n\n      const rawTokensTensor =\n        splitStringsForBpe(inputs, this.unsplittableTokens);\n      const rawTokens = tensorArrTo2DArr(rawTokensTensor) as string[][];\n\n      const tokenRowSplits = [0];\n      for (const [idx, token] of rawTokens.entries()) {\n        tokenRowSplits.push(tokenRowSplits[idx] + token.length);\n      }\n\n      const flatTokens = rawTokens.reduce((acc, e) => acc.concat(e), []);\n\n      // Check cache.\n      const cacheLookup = this.cache.lookup(flatTokens);\n      const cacheMask = cacheLookup.map(e => e === '');\n\n      const hasUnseenWords = cacheMask.some(\n        (bool, idx) => bool && flatTokens[idx] !== '');\n\n      const processUnseenTokens = (): string[]  => {\n        const unseenTokens = flatTokens.filter((_, idx) => cacheMask[idx]);\n        this.bpeMergeAndUpdateCache(tensor(unseenTokens));\n        return this.cache.lookup(flatTokens);\n      };\n\n      // If `has_unseen_words == True`, it means not all tokens are in cache,\n      // we will process the unseen tokens. Otherwise return the cache lookup.\n      const tokenizedWords =\n        hasUnseenWords ? processUnseenTokens() : cacheLookup;\n\n      const tokensTensor = this.tokenToIdMap.lookup(\n        tokenizedWords.map(word => tensor(word.split(' '))));\n      const tokens = tokensTensor.map(t => Array.from(t.dataSync()));\n\n      // Unflatten to match input.\n      const newTokenRowSplits = [0];\n      for (const [idx, token] of tokens.entries()) {\n        newTokenRowSplits.push(newTokenRowSplits[idx] + token.length);\n      }\n      const newFlatTokens = tokens.reduce((acc, e) => acc.concat(e), []);\n      const gatheredIndices =\n        tokenRowSplits.map(index => newTokenRowSplits[index]);\n\n      let tokens2D: Tensor[] = [];\n      for (let i = 0; i < gatheredIndices.length - 1; i++) {\n        const [start, end] = [gatheredIndices[i], gatheredIndices[i+1]];\n        const row = newFlatTokens.slice(start, end);\n        tokens2D.push(tensor(row));\n      }\n\n      // Convert to a dense output if `sequenceLength` is set.\n      if (this.sequenceLength) {\n        // pad or truncate\n        tokens2D = tokens2D.map(t => {\n          if (t.size === this.sequenceLength) {\n            return t;\n          } else if (t.size > this.sequenceLength) {\n            return t.slice(0, this.sequenceLength);\n          } else {\n            return t.pad([[0, this.sequenceLength - t.size]]);\n          }\n        });\n      }\n\n      return tokens2D;\n    });\n  }\n\n  override detokenize(inputs: Tensor[]): Tensor {\n    const unicodeText = this.idToTokenMap.lookup(inputs)\n      .map(t => (tensorToArr(t) as string[]).join(''));\n\n    return tensor(unicodeText);\n  }\n}\nserialization.registerClass(BytePairTokenizer);\n"]}