gx
chenyc
2025-02-12 ea42ff3ebee1eeb3fb29423aa848a249441db81c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
/**
 * @license
 * Copyright 2021 Google LLC. All Rights Reserved.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * =============================================================================
 */
import { ENGINE } from '../../engine';
import { StringNGrams } from '../../kernel_names';
import { convertToTensor } from '../../tensor_util_env';
import { op } from '../operation';
/**
 * Creates ngrams from ragged string data.
 *
 * This op accepts a ragged tensor with 1 ragged dimension containing only
 * strings and outputs a ragged tensor with 1 ragged dimension containing ngrams
 * of that string, joined along the innermost axis.
 *
 * ```js
 * const result = tf.string.stringNGrams(
 *   ['a', 'b', 'c', 'd'], tf.tensor1d([0, 2, 4], 'int32'),
 *   '|', [1, 2], 'LP', 'RP', -1, false);
 * result['nGrams'].print(); // ['a', 'b', 'LP|a', 'a|b', 'b|RP',
 *                           //  'c', 'd', 'LP|c', 'c|d', 'd|RP']
 * result['nGramsSplits'].print(); // [0, 5, 10]
 * ```
 * @param data: The values tensor of the ragged string tensor to make ngrams out
 *     of. Must be a 1D string tensor.
 * @param dataSplits: The splits tensor of the ragged string tensor to make
 *     ngrams out of.
 * @param separator: The string to append between elements of the token. Use ""
 *     for no separator.
 * @param nGramWidths: The sizes of the ngrams to create.
 * @param leftPad: The string to use to pad the left side of the ngram sequence.
 *     Only used if pad_width !== 0.
 * @param rightPad: The string to use to pad the right side of the ngram
 *     sequence. Only used if pad_width !== 0.
 * @param padWidth: The number of padding elements to add to each side of each
 *     sequence. Note that padding will never be greater than `nGramWidths`-1
 *     regardless of this value. If `padWidth`=-1, then add max(`nGramWidths`)-1
 *     elements.
 * @param preserveShortSequences: If true, then ensure that at least one ngram
 *     is generated for each input sequence. In particular, if an input sequence
 *     is shorter than min(ngramWidth) + 2*padWidth, then generate a single
 *     ngram containing the entire sequence. If false, then no ngrams are
 *     generated for these short input sequences.
 * @return A map with the following properties:
 *     - nGrams: The values tensor of the output ngrams ragged tensor.
 *     - nGramsSplits: The splits tensor of the output ngrams ragged tensor.
 *
 * @doc {heading: 'Operations', subheading: 'String'}
 */
function stringNGrams_(data, dataSplits, separator, nGramWidths, leftPad, rightPad, padWidth, preserveShortSequences) {
    const $data = convertToTensor(data, 'data', 'stringNGrams', 'string');
    if ($data.dtype !== 'string') {
        throw new Error('Data must be of datatype string');
    }
    if ($data.shape.length !== 1) {
        throw new Error(`Data must be a vector, saw: ${$data.shape}`);
    }
    const $dataSplits = convertToTensor(dataSplits, 'dataSplits', 'stringNGrams');
    if ($dataSplits.dtype !== 'int32') {
        throw new Error('Data splits must be of datatype int32');
    }
    const attrs = {
        separator,
        nGramWidths,
        leftPad,
        rightPad,
        padWidth,
        preserveShortSequences
    };
    const inputs = { data: $data, dataSplits: $dataSplits };
    const result = ENGINE.runKernel(StringNGrams, inputs, attrs);
    return { nGrams: result[0], nGramsSplits: result[1] };
}
export const stringNGrams = /* @__PURE__ */ op({ stringNGrams_ });
//# sourceMappingURL=data:application/json;base64,eyJ2ZXJzaW9uIjozLCJmaWxlIjoic3RyaW5nX25fZ3JhbXMuanMiLCJzb3VyY2VSb290IjoiIiwic291cmNlcyI6WyIuLi8uLi8uLi8uLi8uLi8uLi8uLi90ZmpzLWNvcmUvc3JjL29wcy9zdHJpbmcvc3RyaW5nX25fZ3JhbXMudHMiXSwibmFtZXMiOltdLCJtYXBwaW5ncyI6IkFBQUE7Ozs7Ozs7Ozs7Ozs7OztHQWVHO0FBRUgsT0FBTyxFQUFDLE1BQU0sRUFBQyxNQUFNLGNBQWMsQ0FBQztBQUNwQyxPQUFPLEVBQUMsWUFBWSxFQUF3QyxNQUFNLG9CQUFvQixDQUFDO0FBR3ZGLE9BQU8sRUFBQyxlQUFlLEVBQUMsTUFBTSx1QkFBdUIsQ0FBQztBQUV0RCxPQUFPLEVBQUMsRUFBRSxFQUFDLE1BQU0sY0FBYyxDQUFDO0FBRWhDOzs7Ozs7Ozs7Ozs7Ozs7Ozs7Ozs7Ozs7Ozs7Ozs7Ozs7Ozs7Ozs7O0dBd0NHO0FBQ0gsU0FBUyxhQUFhLENBQ2xCLElBQXlCLEVBQUUsVUFBNkIsRUFBRSxTQUFpQixFQUMzRSxXQUFxQixFQUFFLE9BQWUsRUFBRSxRQUFnQixFQUFFLFFBQWdCLEVBQzFFLHNCQUErQjtJQUNqQyxNQUFNLEtBQUssR0FBRyxlQUFlLENBQUMsSUFBSSxFQUFFLE1BQU0sRUFBRSxjQUFjLEVBQUUsUUFBUSxDQUFDLENBQUM7SUFDdEUsSUFBSSxLQUFLLENBQUMsS0FBSyxLQUFLLFFBQVEsRUFBRTtRQUM1QixNQUFNLElBQUksS0FBSyxDQUFDLGlDQUFpQyxDQUFDLENBQUM7S0FDcEQ7SUFDRCxJQUFJLEtBQUssQ0FBQyxLQUFLLENBQUMsTUFBTSxLQUFLLENBQUMsRUFBRTtRQUM1QixNQUFNLElBQUksS0FBSyxDQUFDLCtCQUErQixLQUFLLENBQUMsS0FBSyxFQUFFLENBQUMsQ0FBQztLQUMvRDtJQUVELE1BQU0sV0FBVyxHQUFHLGVBQWUsQ0FBQyxVQUFVLEVBQUUsWUFBWSxFQUFFLGNBQWMsQ0FBQyxDQUFDO0lBQzlFLElBQUksV0FBVyxDQUFDLEtBQUssS0FBSyxPQUFPLEVBQUU7UUFDakMsTUFBTSxJQUFJLEtBQUssQ0FBQyx1Q0FBdUMsQ0FBQyxDQUFDO0tBQzFEO0lBRUQsTUFBTSxLQUFLLEdBQXNCO1FBQy9CLFNBQVM7UUFDVCxXQUFXO1FBQ1gsT0FBTztRQUNQLFFBQVE7UUFDUixRQUFRO1FBQ1Isc0JBQXNCO0tBQ3ZCLENBQUM7SUFFRixNQUFNLE1BQU0sR0FBdUIsRUFBQyxJQUFJLEVBQUUsS0FBSyxFQUFFLFVBQVUsRUFBRSxXQUFXLEVBQUMsQ0FBQztJQUMxRSxNQUFNLE1BQU0sR0FDUixNQUFNLENBQUMsU0FBUyxDQUFDLFlBQVksRUFBRSxNQUFZLEVBQUUsS0FBVyxDQUFDLENBQUM7SUFDOUQsT0FBTyxFQUFDLE1BQU0sRUFBRSxNQUFNLENBQUMsQ0FBQyxDQUFDLEVBQUUsWUFBWSxFQUFFLE1BQU0sQ0FBQyxDQUFDLENBQUMsRUFBQyxDQUFDO0FBQ3RELENBQUM7QUFFRCxNQUFNLENBQUMsTUFBTSxZQUFZLEdBQUcsZUFBZSxDQUFDLEVBQUUsQ0FBQyxFQUFDLGFBQWEsRUFBQyxDQUFDLENBQUMiLCJzb3VyY2VzQ29udGVudCI6WyIvKipcbiAqIEBsaWNlbnNlXG4gKiBDb3B5cmlnaHQgMjAyMSBHb29nbGUgTExDLiBBbGwgUmlnaHRzIFJlc2VydmVkLlxuICogTGljZW5zZWQgdW5kZXIgdGhlIEFwYWNoZSBMaWNlbnNlLCBWZXJzaW9uIDIuMCAodGhlIFwiTGljZW5zZVwiKTtcbiAqIHlvdSBtYXkgbm90IHVzZSB0aGlzIGZpbGUgZXhjZXB0IGluIGNvbXBsaWFuY2Ugd2l0aCB0aGUgTGljZW5zZS5cbiAqIFlvdSBtYXkgb2J0YWluIGEgY29weSBvZiB0aGUgTGljZW5zZSBhdFxuICpcbiAqIGh0dHA6Ly93d3cuYXBhY2hlLm9yZy9saWNlbnNlcy9MSUNFTlNFLTIuMFxuICpcbiAqIFVubGVzcyByZXF1aXJlZCBieSBhcHBsaWNhYmxlIGxhdyBvciBhZ3JlZWQgdG8gaW4gd3JpdGluZywgc29mdHdhcmVcbiAqIGRpc3RyaWJ1dGVkIHVuZGVyIHRoZSBMaWNlbnNlIGlzIGRpc3RyaWJ1dGVkIG9uIGFuIFwiQVMgSVNcIiBCQVNJUyxcbiAqIFdJVEhPVVQgV0FSUkFOVElFUyBPUiBDT05ESVRJT05TIE9GIEFOWSBLSU5ELCBlaXRoZXIgZXhwcmVzcyBvciBpbXBsaWVkLlxuICogU2VlIHRoZSBMaWNlbnNlIGZvciB0aGUgc3BlY2lmaWMgbGFuZ3VhZ2UgZ292ZXJuaW5nIHBlcm1pc3Npb25zIGFuZFxuICogbGltaXRhdGlvbnMgdW5kZXIgdGhlIExpY2Vuc2UuXG4gKiA9PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PVxuICovXG5cbmltcG9ydCB7RU5HSU5FfSBmcm9tICcuLi8uLi9lbmdpbmUnO1xuaW1wb3J0IHtTdHJpbmdOR3JhbXMsIFN0cmluZ05HcmFtc0F0dHJzLCBTdHJpbmdOR3JhbXNJbnB1dHN9IGZyb20gJy4uLy4uL2tlcm5lbF9uYW1lcyc7XG5pbXBvcnQge1RlbnNvciwgVGVuc29yMUR9IGZyb20gJy4uLy4uL3RlbnNvcic7XG5pbXBvcnQge05hbWVkVGVuc29yTWFwfSBmcm9tICcuLi8uLi90ZW5zb3JfdHlwZXMnO1xuaW1wb3J0IHtjb252ZXJ0VG9UZW5zb3J9IGZyb20gJy4uLy4uL3RlbnNvcl91dGlsX2Vudic7XG5pbXBvcnQge1RlbnNvckxpa2V9IGZyb20gJy4uLy4uL3R5cGVzJztcbmltcG9ydCB7b3B9IGZyb20gJy4uL29wZXJhdGlvbic7XG5cbi8qKlxuICogQ3JlYXRlcyBuZ3JhbXMgZnJvbSByYWdnZWQgc3RyaW5nIGRhdGEuXG4gKlxuICogVGhpcyBvcCBhY2NlcHRzIGEgcmFnZ2VkIHRlbnNvciB3aXRoIDEgcmFnZ2VkIGRpbWVuc2lvbiBjb250YWluaW5nIG9ubHlcbiAqIHN0cmluZ3MgYW5kIG91dHB1dHMgYSByYWdnZWQgdGVuc29yIHdpdGggMSByYWdnZWQgZGltZW5zaW9uIGNvbnRhaW5pbmcgbmdyYW1zXG4gKiBvZiB0aGF0IHN0cmluZywgam9pbmVkIGFsb25nIHRoZSBpbm5lcm1vc3QgYXhpcy5cbiAqXG4gKiBgYGBqc1xuICogY29uc3QgcmVzdWx0ID0gdGYuc3RyaW5nLnN0cmluZ05HcmFtcyhcbiAqICAgWydhJywgJ2InLCAnYycsICdkJ10sIHRmLnRlbnNvcjFkKFswLCAyLCA0XSwgJ2ludDMyJyksXG4gKiAgICd8JywgWzEsIDJdLCAnTFAnLCAnUlAnLCAtMSwgZmFsc2UpO1xuICogcmVzdWx0WyduR3JhbXMnXS5wcmludCgpOyAvLyBbJ2EnLCAnYicsICdMUHxhJywgJ2F8YicsICdifFJQJyxcbiAqICAgICAgICAgICAgICAgICAgICAgICAgICAgLy8gICdjJywgJ2QnLCAnTFB8YycsICdjfGQnLCAnZHxSUCddXG4gKiByZXN1bHRbJ25HcmFtc1NwbGl0cyddLnByaW50KCk7IC8vIFswLCA1LCAxMF1cbiAqIGBgYFxuICogQHBhcmFtIGRhdGE6IFRoZSB2YWx1ZXMgdGVuc29yIG9mIHRoZSByYWdnZWQgc3RyaW5nIHRlbnNvciB0byBtYWtlIG5ncmFtcyBvdXRcbiAqICAgICBvZi4gTXVzdCBiZSBhIDFEIHN0cmluZyB0ZW5zb3IuXG4gKiBAcGFyYW0gZGF0YVNwbGl0czogVGhlIHNwbGl0cyB0ZW5zb3Igb2YgdGhlIHJhZ2dlZCBzdHJpbmcgdGVuc29yIHRvIG1ha2VcbiAqICAgICBuZ3JhbXMgb3V0IG9mLlxuICogQHBhcmFtIHNlcGFyYXRvcjogVGhlIHN0cmluZyB0byBhcHBlbmQgYmV0d2VlbiBlbGVtZW50cyBvZiB0aGUgdG9rZW4uIFVzZSBcIlwiXG4gKiAgICAgZm9yIG5vIHNlcGFyYXRvci5cbiAqIEBwYXJhbSBuR3JhbVdpZHRoczogVGhlIHNpemVzIG9mIHRoZSBuZ3JhbXMgdG8gY3JlYXRlLlxuICogQHBhcmFtIGxlZnRQYWQ6IFRoZSBzdHJpbmcgdG8gdXNlIHRvIHBhZCB0aGUgbGVmdCBzaWRlIG9mIHRoZSBuZ3JhbSBzZXF1ZW5jZS5cbiAqICAgICBPbmx5IHVzZWQgaWYgcGFkX3dpZHRoICE9PSAwLlxuICogQHBhcmFtIHJpZ2h0UGFkOiBUaGUgc3RyaW5nIHRvIHVzZSB0byBwYWQgdGhlIHJpZ2h0IHNpZGUgb2YgdGhlIG5ncmFtXG4gKiAgICAgc2VxdWVuY2UuIE9ubHkgdXNlZCBpZiBwYWRfd2lkdGggIT09IDAuXG4gKiBAcGFyYW0gcGFkV2lkdGg6IFRoZSBudW1iZXIgb2YgcGFkZGluZyBlbGVtZW50cyB0byBhZGQgdG8gZWFjaCBzaWRlIG9mIGVhY2hcbiAqICAgICBzZXF1ZW5jZS4gTm90ZSB0aGF0IHBhZGRpbmcgd2lsbCBuZXZlciBiZSBncmVhdGVyIHRoYW4gYG5HcmFtV2lkdGhzYC0xXG4gKiAgICAgcmVnYXJkbGVzcyBvZiB0aGlzIHZhbHVlLiBJZiBgcGFkV2lkdGhgPS0xLCB0aGVuIGFkZCBtYXgoYG5HcmFtV2lkdGhzYCktMVxuICogICAgIGVsZW1lbnRzLlxuICogQHBhcmFtIHByZXNlcnZlU2hvcnRTZXF1ZW5jZXM6IElmIHRydWUsIHRoZW4gZW5zdXJlIHRoYXQgYXQgbGVhc3Qgb25lIG5ncmFtXG4gKiAgICAgaXMgZ2VuZXJhdGVkIGZvciBlYWNoIGlucHV0IHNlcXVlbmNlLiBJbiBwYXJ0aWN1bGFyLCBpZiBhbiBpbnB1dCBzZXF1ZW5jZVxuICogICAgIGlzIHNob3J0ZXIgdGhhbiBtaW4obmdyYW1XaWR0aCkgKyAyKnBhZFdpZHRoLCB0aGVuIGdlbmVyYXRlIGEgc2luZ2xlXG4gKiAgICAgbmdyYW0gY29udGFpbmluZyB0aGUgZW50aXJlIHNlcXVlbmNlLiBJZiBmYWxzZSwgdGhlbiBubyBuZ3JhbXMgYXJlXG4gKiAgICAgZ2VuZXJhdGVkIGZvciB0aGVzZSBzaG9ydCBpbnB1dCBzZXF1ZW5jZXMuXG4gKiBAcmV0dXJuIEEgbWFwIHdpdGggdGhlIGZvbGxvd2luZyBwcm9wZXJ0aWVzOlxuICogICAgIC0gbkdyYW1zOiBUaGUgdmFsdWVzIHRlbnNvciBvZiB0aGUgb3V0cHV0IG5ncmFtcyByYWdnZWQgdGVuc29yLlxuICogICAgIC0gbkdyYW1zU3BsaXRzOiBUaGUgc3BsaXRzIHRlbnNvciBvZiB0aGUgb3V0cHV0IG5ncmFtcyByYWdnZWQgdGVuc29yLlxuICpcbiAqIEBkb2Mge2hlYWRpbmc6ICdPcGVyYXRpb25zJywgc3ViaGVhZGluZzogJ1N0cmluZyd9XG4gKi9cbmZ1bmN0aW9uIHN0cmluZ05HcmFtc18oXG4gICAgZGF0YTogVGVuc29yMUR8VGVuc29yTGlrZSwgZGF0YVNwbGl0czogVGVuc29yfFRlbnNvckxpa2UsIHNlcGFyYXRvcjogc3RyaW5nLFxuICAgIG5HcmFtV2lkdGhzOiBudW1iZXJbXSwgbGVmdFBhZDogc3RyaW5nLCByaWdodFBhZDogc3RyaW5nLCBwYWRXaWR0aDogbnVtYmVyLFxuICAgIHByZXNlcnZlU2hvcnRTZXF1ZW5jZXM6IGJvb2xlYW4pOiBOYW1lZFRlbnNvck1hcCB7XG4gIGNvbnN0ICRkYXRhID0gY29udmVydFRvVGVuc29yKGRhdGEsICdkYXRhJywgJ3N0cmluZ05HcmFtcycsICdzdHJpbmcnKTtcbiAgaWYgKCRkYXRhLmR0eXBlICE9PSAnc3RyaW5nJykge1xuICAgIHRocm93IG5ldyBFcnJvcignRGF0YSBtdXN0IGJlIG9mIGRhdGF0eXBlIHN0cmluZycpO1xuICB9XG4gIGlmICgkZGF0YS5zaGFwZS5sZW5ndGggIT09IDEpIHtcbiAgICB0aHJvdyBuZXcgRXJyb3IoYERhdGEgbXVzdCBiZSBhIHZlY3Rvciwgc2F3OiAkeyRkYXRhLnNoYXBlfWApO1xuICB9XG5cbiAgY29uc3QgJGRhdGFTcGxpdHMgPSBjb252ZXJ0VG9UZW5zb3IoZGF0YVNwbGl0cywgJ2RhdGFTcGxpdHMnLCAnc3RyaW5nTkdyYW1zJyk7XG4gIGlmICgkZGF0YVNwbGl0cy5kdHlwZSAhPT0gJ2ludDMyJykge1xuICAgIHRocm93IG5ldyBFcnJvcignRGF0YSBzcGxpdHMgbXVzdCBiZSBvZiBkYXRhdHlwZSBpbnQzMicpO1xuICB9XG5cbiAgY29uc3QgYXR0cnM6IFN0cmluZ05HcmFtc0F0dHJzID0ge1xuICAgIHNlcGFyYXRvcixcbiAgICBuR3JhbVdpZHRocyxcbiAgICBsZWZ0UGFkLFxuICAgIHJpZ2h0UGFkLFxuICAgIHBhZFdpZHRoLFxuICAgIHByZXNlcnZlU2hvcnRTZXF1ZW5jZXNcbiAgfTtcblxuICBjb25zdCBpbnB1dHM6IFN0cmluZ05HcmFtc0lucHV0cyA9IHtkYXRhOiAkZGF0YSwgZGF0YVNwbGl0czogJGRhdGFTcGxpdHN9O1xuICBjb25zdCByZXN1bHQ6IFRlbnNvcltdID1cbiAgICAgIEVOR0lORS5ydW5LZXJuZWwoU3RyaW5nTkdyYW1zLCBpbnB1dHMgYXMge30sIGF0dHJzIGFzIHt9KTtcbiAgcmV0dXJuIHtuR3JhbXM6IHJlc3VsdFswXSwgbkdyYW1zU3BsaXRzOiByZXN1bHRbMV19O1xufVxuXG5leHBvcnQgY29uc3Qgc3RyaW5nTkdyYW1zID0gLyogQF9fUFVSRV9fICovIG9wKHtzdHJpbmdOR3JhbXNffSk7XG4iXX0=