gx
chenyc
2025-06-12 7b72ac13a83764a662159d4a49b7fffb90476ecb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
/**
 * @license
 * Copyright 2023 Google LLC.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * =============================================================================
 */
/**
 * GPT-2 preprocessor layer.
 */
/* Original source: keras-nlp/models/gpt2/gpt2_preprocessor.py */
import { serialization, tidy } from '@tensorflow/tfjs-core';
import { Preprocessor } from '../preprocessor';
import { GPT2Tokenizer } from './gpt2_tokenizer';
import { StartEndPacker } from '../../preprocessing/start_end_packer';
import { ValueError } from '../../../../errors';
/**
 * Bundles features with optional labels and sample weights.
 *
 * @param x The preprocessed features; always part of the result.
 * @param y Optional label data. When it is `undefined`, `x` is returned on
 *     its own and `sampleWeight` is ignored.
 * @param sampleWeight Optional weight data, only included when `y` is given.
 * @returns `x`, `[x, y]`, or `[x, y, sampleWeight]` depending on which of the
 *     optional arguments are `undefined`.
 */
export function packXYSampleWeight(x, y, sampleWeight) {
    // Without labels there is nothing to pack; the weights are dropped too.
    if (y === undefined) {
        return x;
    }
    return sampleWeight === undefined ? [x, y] : [x, y, sampleWeight];
}
/**
 * GPT2 preprocessing layer which tokenizes and packs inputs.
 *
 * This preprocessing layer will do 2 things:
 *
 * - Tokenize the inputs using the `tokenizer`.
 * - Pack the tokens with a `StartEndPacker` into a map with keys
 *     `"tokenIds"` and `"paddingMask"`, intended to be passed to a
 *     `GPT2Backbone`.
 *
 * `call` takes the inputs plus an optional options object (`kwargs`) and
 * returns only the packed `tokenIds`. `callAndPackArgs` returns the full
 * `{tokenIds, paddingMask}` map, bundled with `kwargs.y` and
 * `kwargs.sampleWeight` when those are provided; both are passed through
 * unaltered. `kwargs.sequenceLength` overrides the configured
 * `sequenceLength` for a single call.
 *
 * `GPT2Preprocessor` forces the input to have only one segment, as GPT2 is
 * mainly used for generation tasks. For tasks having multi-segment inputs
 * like "glue/mnli", please use a model designed for classification purposes
 * such as BERT or RoBERTa.
 *
 * Examples:
 *
 * Directly calling the layer on data.
 * ```js
 * const vocabulary =
 *    new Map([['<|endoftext|>', 0], ['a', 4], ['Ġquick', 5], ['Ġfox', 6]]);
 * const merges =
 *    ['Ġ q', 'u i', 'c k', 'ui ck', 'Ġq uick', 'Ġ f', 'o x', 'Ġf ox'];
 * const tokenizer = new GPT2Tokenizer({vocabulary, merges});
 *
 * const preprocessor = new GPT2Preprocessor({tokenizer});
 * preprocessor.call(tensor(['the quick brown fox jumped.'])).print();
 * ```
 */
class GPT2Preprocessor extends Preprocessor {
    /**
     * @param args Layer arguments. `args.tokenizer` supplies the start, end and
     *     pad token ids used for packing. `args.sequenceLength` defaults to
     *     1024; `args.addStartToken` and `args.addEndToken` default to `true`.
     */
    constructor(args) {
        var _a, _b, _c;
        super(args);
        this.tokenizer = args.tokenizer;
        // Down-levelled `??`: only `null`/`undefined` fall back to the default,
        // so an explicit `false` or `0` is preserved.
        this.sequenceLength = (_a = args.sequenceLength) !== null && _a !== void 0 ? _a : 1024;
        this.addStartToken = (_b = args.addStartToken) !== null && _b !== void 0 ? _b : true;
        this.addEndToken = (_c = args.addEndToken) !== null && _c !== void 0 ? _c : true;
        const gpt2Tokenizer = this.tokenizer;
        this.packer = new StartEndPacker({
            startValue: gpt2Tokenizer.startTokenId,
            endValue: gpt2Tokenizer.endTokenId,
            padValue: gpt2Tokenizer.padTokenId,
            sequenceLength: this.sequenceLength,
        });
    }
    /**
     * Returns this layer's packing settings merged with the base-class config.
     * Keys present in the base config take precedence on collision.
     */
    getConfig() {
        const config = {
            sequenceLength: this.sequenceLength,
            addStartToken: this.addStartToken,
            addEndToken: this.addEndToken,
        };
        const baseConfig = super.getConfig();
        Object.assign(config, baseConfig);
        return config;
    }
    /**
     * Tokenizes and packs `inputs`, returning only the packed token ids.
     *
     * @param inputs A single input, or an array holding exactly one input.
     * @param kwargs Optional options; may be omitted. `sequenceLength`
     *     overrides the configured length for this call.
     * @throws ValueError if `inputs` is an array whose length is not 1.
     */
    call(inputs, kwargs) {
        return this.callAndReturnPaddingMask(inputs, kwargs).tokenIds;
    }
    /**
     * Tokenizes and packs `inputs` inside a `tidy` scope.
     *
     * @param inputs A single input, or an array holding exactly one input.
     * @param kwargs Optional options; may be omitted.
     * @returns An object with `tokenIds` and `paddingMask`.
     * @throws ValueError if `inputs` is an array whose length is not 1.
     */
    callAndReturnPaddingMask(inputs, kwargs) {
        return tidy(() => {
            var _a;
            // Callers may legitimately omit the options object; treat a missing
            // one as empty instead of failing on a property read.
            const options = kwargs !== null && kwargs !== void 0 ? kwargs : {};
            if (inputs instanceof Array) {
                if (inputs.length !== 1) {
                    throw new ValueError('GPT2 requires each input feature to contain only ' +
                        `one segment, but received ${inputs.length}. If you are using ` +
                        'GPT2 for a multi-segment classification task, please refer to ' +
                        'classification models like BERT or RoBERTa.');
                }
                // Unwrap the single segment so the tokenizer sees it directly.
                inputs = inputs[0];
            }
            // A per-call `sequenceLength` wins over the configured one.
            const sequenceLength = (_a = options.sequenceLength) !== null && _a !== void 0 ? _a : this.sequenceLength;
            const [tokenIds, paddingMask] = this.packer.callAndReturnPaddingMask(this.tokenizer.call(inputs), {
                sequenceLength,
                addStartValue: this.addStartToken,
                addEndValue: this.addEndToken
            });
            return {
                tokenIds: tokenIds,
                paddingMask: paddingMask
            };
        });
    }
    /**
     * Calls the layer and returns extra information like the paddingMask used to
     * pack the sequence, the label data, and the sample weights used.
     *
     * @param inputs A single input, or an array holding exactly one input.
     * @param kwargs Optional options; may be omitted. `y` and `sampleWeight`
     *     are passed through unaltered.
     * @returns The `{tokenIds, paddingMask}` map alone when `kwargs.y` is
     *     `undefined`; otherwise an array of the map, `y`, and (when defined)
     *     `sampleWeight`.
     */
    callAndPackArgs(inputs, kwargs) {
        const options = kwargs !== null && kwargs !== void 0 ? kwargs : {};
        const x = this.callAndReturnPaddingMask(inputs, options);
        return packXYSampleWeight(x, options.y, options.sampleWeight);
    }
    /**
     * Returns the tokenizer class paired with this preprocessor.
     *
     * @param cls Unused; kept for signature compatibility.
     */
    static tokenizerCls(cls) {
        return GPT2Tokenizer;
    }
}
// Static `className` is attached after the class body (compiled form of a
// static property). It must be assigned before the class is registered below.
/** @nocollapse */
GPT2Preprocessor.className = 'GPT2Preprocessor';
export { GPT2Preprocessor };
// Register the class with the tfjs-core serialization registry at module load.
serialization.registerClass(GPT2Preprocessor);
//# sourceMappingURL=data:application/json;base64,{"version":3,"file":"gpt2_preprocessor.js","sourceRoot":"","sources":["../../../../../../../../../tfjs-layers/src/layers/nlp/models/gpt2/gpt2_preprocessor.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;GAeG;AAEH;;GAEG;AAEH,iEAAiE;AACjE,OAAO,EAAoC,aAAa,EAAE,IAAI,EAAE,MAAM,uBAAuB,CAAC;AAG9F,OAAO,EAAE,YAAY,EAAE,MAAM,iBAAiB,CAAC;AAC/C,OAAO,EAAE,aAAa,EAAE,MAAM,kBAAkB,CAAC;AACjD,OAAO,EAAE,cAAc,EAAE,MAAM,sCAAsC,CAAC;AACtE,OAAO,EAAE,UAAU,EAAE,MAAM,oBAAoB,CAAC;AA8ChD,MAAM,UAAU,kBAAkB,CAChC,CAAiB,EAAE,CAAU,EAAE,YAAqB;IAKpD,IAAI,CAAC,KAAK,SAAS,EAAE;QACnB,OAAO,CAAC,CAAC;KACV;SAAM,IAAI,YAAY,KAAK,SAAS,EAAE;QACrC,OAAO,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC;KACf;SAAM;QACL,OAAO,CAAC,CAAC,EAAE,CAAC,EAAE,YAAY,CAAC,CAAC;KAC7B;AACH,CAAC;AAED;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAmCG;AACH,MAAa,gBAAiB,SAAQ,YAAY;IAShD,YAAY,IAA0B;;QACpC,KAAK,CAAC,IAAI,CAAC,CAAC;QACZ,IAAI,CAAC,SAAS,GAAG,IAAI,CAAC,SAAS,CAAC;QAChC,IAAI,CAAC,cAAc,GAAG,MAAA,IAAI,CAAC,cAAc,mCAAI,IAAI,CAAC;QAClD,IAAI,CAAC,aAAa,GAAG,MAAA,IAAI,CAAC,aAAa,mCAAI,IAAI,CAAC;QAChD,IAAI,CAAC,WAAW,GAAG,MAAA,IAAI,CAAC,WAAW,mCAAI,IAAI,CAAC;QAE5C,MAAM,aAAa,GAAG,IAAI,CAAC,SAA0B,CAAC;QACtD,IAAI,CAAC,MAAM,GAAG,IAAI,cAAc,CAAC;YAC/B,UAAU,EAAE,aAAa,CAAC,YAAY;YACtC,QAAQ,EAAE,aAAa,CAAC,UAAU;YAClC,QAAQ,EAAE,aAAa,CAAC,UAAU;YAClC,cAAc,EAAE,IAAI,CAAC,cAAc;SACpC,CAAC,CAAC;IACL,CAAC;IAEQ,SAAS;QAChB,MAAM,MAAM,GAAG;YACb,cAAc,EAAE,IAAI,CAAC,cAAc;YACnC,aAAa,EAAE,IAAI,CAAC,aAAa;YACjC,WAAW,EAAE,IAAI,CAAC,WAAW;SAC9B,CAAC;QACF,MAAM,UAAU,GAAG,KAAK,CAAC,SAAS,EAAE,CAAC;QACrC,MAAM,CAAC,MAAM,CAAC,MAAM,EAAE,UAAU,CAAC,CAAC;QAClC,OAAO,MAAM,CAAC;IAChB,CAAC;IAEQ,IAAI,CACX,MAAuB,EAAE,MAA+B;QACxD,OAAO,IAAI,CAAC,wBAAwB,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC,QAAQ,CAAC;IAChE,CAAC;IAEO,wBAAwB,CAC9B,MAAuB,EACvB,MAA+B;QAE/B,OAAO,IAAI,CAAC,GAAG,EAAE;;YACf,IAAI,MAAM,YAAY,KAAK,EAAE;gBAC3B,IAAI,MAAM,CAAC,MAAM,KAAK,CAAC,EAAE;oBACvB,MAAM,IAAI,UAAU,CAClB,mDAAmD;wBACnD,6BAA6B,MAAM,CAAC,MAAM,qBAAqB;wBAC/D,gEAAgE;wBAChE,6CAA6C,CAC9C,CAAC;iBACH;gBACD,MAAM,GAAG,MAA
M,CAAC,CAAC,CAAC,CAAC;aACpB;YAED,MAAM,cAAc,GAAG,MAAA,MAAM,CAAC,cAAc,mCAAI,IAAI,CAAC,cAAc,CAAC;YACpE,MAAM,CAAC,QAAQ,EAAE,WAAW,CAAC,GAAG,IAAI,CAAC,MAAM,CAAC,wBAAwB,CAClE,IAAI,CAAC,SAAS,CAAC,IAAI,CAAC,MAAM,CAAC,EAC3B;gBACE,cAAc;gBACd,aAAa,EAAE,IAAI,CAAC,aAAa;gBACjC,WAAW,EAAE,IAAI,CAAC,WAAW;aAC9B,CACF,CAAC;YAEF,OAAO;gBACL,QAAQ,EAAE,QAAoB;gBAC9B,WAAW,EAAE,WAAuB;aACrC,CAAC;QACJ,CAAC,CAAC,CAAC;IACL,CAAC;IAED;;;OAGG;IACH,eAAe,CAAC,MAAuB,EAAE,MAA+B;QAItE,MAAM,CAAC,GAAG,IAAI,CAAC,wBAAwB,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;QACxD,OAAO,kBAAkB,CAAC,CAAC,EAAE,MAAM,CAAC,CAAC,EAAE,MAAM,CAAC,YAAY,CAAC,CAAC;IAC9D,CAAC;IAED,MAAM,CAAU,YAAY,CAC1B,GAA6C;QAC7C,OAAO,aAAa,CAAC;IACvB,CAAC;;AAzFD,kBAAkB;AACF,0BAAS,GAAG,kBAAkB,CAAC;SAFpC,gBAAgB;AA4F7B,aAAa,CAAC,aAAa,CAAC,gBAAgB,CAAC,CAAC","sourcesContent":["/**\n * @license\n * Copyright 2023 Google LLC.\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may not use this file except in compliance with the License.\n * You may obtain a copy of the License at\n *\n * http://www.apache.org/licenses/LICENSE-2.0\n *\n * Unless required by applicable law or agreed to in writing, software\n * distributed under the License is distributed on an \"AS IS\" BASIS,\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n * See the License for the specific language governing permissions and\n * limitations under the License.\n * =============================================================================\n */\n\n/**\n * GPT-2 preprocessor layer.\n */\n\n/* Original source: keras-nlp/models/gpt2/gpt2_preprocessor.py */\nimport { NamedTensorMap, Tensor, Tensor2D, serialization, tidy } from '@tensorflow/tfjs-core';\n\nimport { LayerArgs } from '../../../../engine/topology';\nimport { Preprocessor } from '../preprocessor';\nimport { GPT2Tokenizer } from './gpt2_tokenizer';\nimport { StartEndPacker } from '../../preprocessing/start_end_packer';\nimport { ValueError } from '../../../../errors';\n\nexport declare 
interface GPT2PreprocessorArgs extends LayerArgs {\n  /**\n   * A GPT2Tokenizer instance.\n   */\n  tokenizer: GPT2Tokenizer;\n\n  /**\n   * The length of the packed inputs.\n   * Defaults to 1024.\n   */\n  sequenceLength?: number;\n\n  /**\n   * If `true`, the preprocessor will prepend the tokenizer start token to each\n   * input sequence.\n   * Defaults to `true`.\n   */\n  addStartToken?: boolean;\n\n  /**\n   * If `true`, the preprocessor will prepend the tokenizer end token to each\n   * input sequence.\n   * Defaults to `true`.\n   */\n  addEndToken?: boolean;\n}\n\nexport declare interface GPT2PreprocessorOptions {\n  /**\n   * Any label data. Will be passed through unaltered.\n   */\n  y?: Tensor;\n\n  /**\n   * Any label weight data. Will be passed through unaltered.\n   */\n  sampleWeight?: Tensor;\n\n  /**\n   * Pass to override the configured `sequenceLength` of the layer.\n   */\n  sequenceLength?: number;\n}\n\nexport function packXYSampleWeight(\n  x: NamedTensorMap, y?: Tensor, sampleWeight?: Tensor):\n  NamedTensorMap\n  | [NamedTensorMap, Tensor]\n  | [NamedTensorMap, Tensor, Tensor] {\n\n  if (y === undefined) {\n    return x;\n  } else if (sampleWeight === undefined) {\n    return [x, y];\n  } else {\n    return [x, y, sampleWeight];\n  }\n}\n\n/**\n * GPT2 preprocessing layer which tokenizes and packs inputs.\n *\n * This preprocessing layer will do 2 things:\n *\n * - Tokenize the inputs using the `tokenizer`.\n * - Construct a dictionary with keys `\"tokenIds\"`, `\"paddingMask\"`, that can\n *     be passed directly to a `GPT2Backbone`.\n *\n * The call method of this layer accepts three arguments, `x`, `y`, and\n * `sampleWeight`. 
`x` can be a string or tensor representing a single\n * segment, a list of strings representing a batch of single segments,\n * or a list of tensors representing multiple segments to be packed together.\n * `y` and `sampleWeight` are both optional, can have any format, and will be\n * passed through unaltered.\n *\n * `GPT2Preprocessor` forces the input to have only one segment, as GPT2 is\n * mainly used for generation tasks. For tasks having multi-segment inputs\n * like \"glue/mnli\", please use a model designed for classification purposes\n * such as BERT or RoBERTa.\n *\n * Examples:\n *\n * Directly calling the layer on data.\n * ```js\n * const features =  ['a quick fox.', 'a fox quick.'];\n * const vocabulary =\n *    new Map([['<|endoftext|>', 0], ['a', 4], ['Ġquick', 5], ['Ġfox', 6]]);\n * const merges =\n *    ['Ġ q', 'u i', 'c k', 'ui ck', 'Ġq uick', 'Ġ f', 'o x', 'Ġf ox'];\n * const tokenizer = GPT2Tokenizer({vocabulary, merges});\n *\n * const preprocessor = GPT2Preprocessor({tokenizer});\n * preprocessor.call(tensor(['the quick brown fox jumped.']))[0].print();\n * ```\n */\nexport class GPT2Preprocessor extends Preprocessor {\n  /** @nocollapse */\n  static override className = 'GPT2Preprocessor';\n\n  protected readonly sequenceLength: number;\n  protected readonly addStartToken: boolean;\n  protected readonly addEndToken: boolean;\n  protected readonly packer: StartEndPacker;\n\n  constructor(args: GPT2PreprocessorArgs) {\n    super(args);\n    this.tokenizer = args.tokenizer;\n    this.sequenceLength = args.sequenceLength ?? 1024;\n    this.addStartToken = args.addStartToken ?? true;\n    this.addEndToken = args.addEndToken ?? 
true;\n\n    const gpt2Tokenizer = this.tokenizer as GPT2Tokenizer;\n    this.packer = new StartEndPacker({\n      startValue: gpt2Tokenizer.startTokenId,\n      endValue: gpt2Tokenizer.endTokenId,\n      padValue: gpt2Tokenizer.padTokenId,\n      sequenceLength: this.sequenceLength,\n    });\n  }\n\n  override getConfig(): serialization.ConfigDict {\n    const config = {\n      sequenceLength: this.sequenceLength,\n      addStartToken: this.addStartToken,\n      addEndToken: this.addEndToken,\n    };\n    const baseConfig = super.getConfig();\n    Object.assign(config, baseConfig);\n    return config;\n  }\n\n  override call(\n    inputs: Tensor|Tensor[], kwargs: GPT2PreprocessorOptions): Tensor|Tensor[] {\n    return this.callAndReturnPaddingMask(inputs, kwargs).tokenIds;\n  }\n\n  private callAndReturnPaddingMask(\n    inputs: Tensor|Tensor[],\n    kwargs: GPT2PreprocessorOptions\n  ): NamedTensorMap {\n    return tidy(() => {\n      if (inputs instanceof Array) {\n        if (inputs.length !== 1) {\n          throw new ValueError(\n            'GPT2 requires each input feature to contain only ' +\n            `one segment, but received ${inputs.length}. If you are using ` +\n            'GPT2 for a multi-segment classification task, please refer to ' +\n            'classification models like BERT or RoBERTa.'\n          );\n        }\n        inputs = inputs[0];\n      }\n\n      const sequenceLength = kwargs.sequenceLength ?? 
this.sequenceLength;\n      const [tokenIds, paddingMask] = this.packer.callAndReturnPaddingMask(\n        this.tokenizer.call(inputs),\n        {\n          sequenceLength,\n          addStartValue: this.addStartToken,\n          addEndValue: this.addEndToken\n        }\n      );\n\n      return {\n        tokenIds: tokenIds as Tensor2D,\n        paddingMask: paddingMask as Tensor2D\n      };\n    });\n  }\n\n  /**\n   * Calls the layer and returns extra information like the paddingMask used to\n   * pack the sequence, the label data, and the sample weights used.\n   */\n  callAndPackArgs(inputs: Tensor|Tensor[], kwargs: GPT2PreprocessorOptions):\n    NamedTensorMap\n    | [NamedTensorMap, Tensor]\n    | [NamedTensorMap, Tensor, Tensor] {\n    const x = this.callAndReturnPaddingMask(inputs, kwargs);\n    return packXYSampleWeight(x, kwargs.y, kwargs.sampleWeight);\n  }\n\n  static override tokenizerCls<T extends serialization.Serializable>(\n    cls: serialization.SerializableConstructor<T>) {\n    return GPT2Tokenizer;\n  }\n}\nserialization.registerClass(GPT2Preprocessor);\n"]}