/**
 * @license
 * Copyright 2018 Google LLC. All Rights Reserved.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * =============================================================================
 */
/// <amd-module name="@tensorflow/tfjs-data/dist/dataset" />
import * as tf from '@tensorflow/tfjs-core';
import { TensorContainer } from '@tensorflow/tfjs-core';
import { LazyIterator } from './iterators/lazy_iterator';
import { Container } from './types';
/**
 * A nested structure of Datasets, used as the input to zip().
 */
export type DatasetContainer = Container<Dataset<TensorContainer>>;
/**
 * Represents a potentially large list of independent data elements (typically
 * 'samples' or 'examples').
 *
 * A 'data example' may be a primitive, an array, a map from string keys to
 * values, or any nested structure of these.
 *
 * A `Dataset` represents an ordered collection of elements, together with a
 * chain of transformations to be performed on those elements. Each
 * transformation is a method of `Dataset` that returns another `Dataset`, so
 * these may be chained, e.g.
 * `const processedDataset = rawDataset.filter(...).map(...).batch(...)`.
 *
 * Data loading and transformation are done in a lazy, streaming fashion. The
 * dataset may be iterated over multiple times; each iteration starts the data
 * loading anew and recapitulates the transformations.
 *
 * A `Dataset` is typically processed as a stream of unbatched examples -- i.e.,
 * its transformations are applied one example at a time. Batching produces a
 * new `Dataset` where each element is a batch. Batching should usually come
 * last in a pipeline, because data transformations are easier to express on a
 * per-example basis than on a per-batch basis.
 *
 * The following code examples call `await dataset.forEachAsync(...)` to
 * iterate once over the entire dataset and print out the data.
*
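 * For instance, a minimal pipeline sketch combining the transformations
 * described above (the specific values and transforms are illustrative):
 * ```js
 * const processed = tf.data.array([1, 2, 3, 4, 5, 6, 7, 8])
 *     .filter(x => x % 2 === 0)
 *     .map(x => x * x)
 *     .batch(2);
 * await processed.forEachAsync(e => e.print());
 * ```
 *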
 * @doc {heading: 'Data', subheading: 'Classes', namespace: 'data'}
 */
export declare abstract class Dataset<T extends tf.TensorContainer> {
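/**
 * Returns a fresh iterator over the elements of this dataset; per the class
 * notes above, each call starts a new pass over the data.
 */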
abstract iterator(): Promise<LazyIterator<T>>;
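/**
 * The number of elements in this dataset, if known in advance; when the size
 * cannot be determined without iterating (e.g. for iterator-defined datasets
 * created without a `size` argument), the reported size may be null.
 */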
readonly size: number;
/**
 * Groups elements into batches.
 *
 * It is assumed that each of the incoming dataset elements has the same
 * structure -- i.e. the same set of keys at each location in an object
 * hierarchy. For each key, the resulting `Dataset` provides a batched
 * element collecting all of the incoming values for that key.
 *
 * * Incoming primitives are grouped into a 1-D Tensor.
 * * Incoming Tensors are grouped into a new Tensor where the 0th axis is
 *   the batch dimension.
 * * Incoming arrays are converted to Tensor and then batched.
 * * A nested array is interpreted as an n-D Tensor, so the batched result
 *   has n+1 dimensions.
 * * An array that cannot be converted to Tensor produces an error.
 *
 * If an array should not be batched as a unit, it should first be converted
 * to an object with integer keys (see the final example below).
*
 * Here are a few examples:
 *
 * Batch a dataset of numbers:
 * ```js
 * const a = tf.data.array([1, 2, 3, 4, 5, 6, 7, 8]).batch(4);
 * await a.forEachAsync(e => e.print());
 * ```
 *
 * Batch a dataset of arrays:
 * ```js
 * const b = tf.data.array([[1], [2], [3], [4], [5], [6], [7], [8]]).batch(4);
 * await b.forEachAsync(e => e.print());
 * ```
 *
 * Batch a dataset of objects:
 * ```js
 * const c = tf.data.array([{a: 1, b: 11}, {a: 2, b: 12}, {a: 3, b: 13},
 *     {a: 4, b: 14}, {a: 5, b: 15}, {a: 6, b: 16}, {a: 7, b: 17},
 *     {a: 8, b: 18}]).batch(4);
 * await c.forEachAsync(e => {
 *   console.log('{');
 *   for(var key in e) {
 *     console.log(key+':');
 *     e[key].print();
 *   }
 *   console.log('}');
 * })
* ```
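 *
 * A sketch of the integer-key workaround mentioned above, for arrays whose
 * elements should not be combined into a single Tensor (the mapping step is
 * illustrative):
 * ```js
 * const d = tf.data.array([[1, 2], [3, 4], [5, 6], [7, 8]])
 *     .map(x => ({0: x[0], 1: x[1]}))
 *     .batch(2);
 * await d.forEachAsync(e => {
 *   for (var key in e) {
 *     console.log(key + ':');
 *     e[key].print();
 *   }
 * });
 * ```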
 *
 * @param batchSize The number of elements desired per batch.
 * @param smallLastBatch Whether to emit the final batch when it has fewer
 *   than batchSize elements. Default true.
 * @returns A `Dataset`, from which a stream of batches can be obtained.
 *
 * @doc {heading: 'Data', subheading: 'Classes'}
 */
batch(batchSize: number, smallLastBatch?: boolean): Dataset<tf.TensorContainer>;
/**
 * Concatenates this `Dataset` with another.
 *
 * ```js
 * const a = tf.data.array([1, 2, 3]);
 * const b = tf.data.array([4, 5, 6]);
 * const c = a.concatenate(b);
 * await c.forEachAsync(e => console.log(e));
 * ```
 *
 * @param dataset A `Dataset` to be concatenated onto this one.
 * @returns A `Dataset`.
 *
 * @doc {heading: 'Data', subheading: 'Classes'}
 */
concatenate(dataset: Dataset<T>): Dataset<T>;
/**
 * Filters this dataset according to `predicate`.
 *
 * ```js
 * const a = tf.data.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
 *     .filter(x => x%2 === 0);
 * await a.forEachAsync(e => console.log(e));
 * ```
 *
 * @param predicate A function mapping a dataset element to a boolean.
 *
 * @returns A `Dataset` of elements for which the predicate was true.
 *
 * @doc {heading: 'Data', subheading: 'Classes'}
 */
filter(predicate: (value: T) => boolean): Dataset<T>;
/**
 * Apply a function to every element of the dataset.
 *
 * After the function is applied to a dataset element, any Tensors contained
 * within that element are disposed.
 *
 * ```js
 * const a = tf.data.array([1, 2, 3]);
 * await a.forEachAsync(e => console.log(e));
 * ```
 *
 * @param f A function to apply to each dataset element.
 * @returns A `Promise` that resolves after all elements have been processed.
 *
 * @doc {heading: 'Data', subheading: 'Classes'}
 */
forEachAsync(f: (input: T) => void): Promise<void>;
/**
 * Maps this dataset through a 1-to-1 transform.
 *
 * ```js
 * const a = tf.data.array([1, 2, 3]).map(x => x*x);
 * await a.forEachAsync(e => console.log(e));
 * ```
 *
 * @param transform A function mapping a dataset element to a transformed
 *   dataset element.
 *
 * @returns A `Dataset` of transformed elements.
 *
 * @doc {heading: 'Data', subheading: 'Classes'}
 */
map<O extends tf.TensorContainer>(transform: (value: T) => O): Dataset<O>;
/**
 * Maps this dataset through an async 1-to-1 transform.
 *
 * ```js
 * const a =
 *     tf.data.array([1, 2, 3]).mapAsync(x => new Promise(function(resolve){
 *       setTimeout(() => {
 *         resolve(x * x);
 *       }, Math.random()*1000 + 500);
 *     }));
 * console.log(await a.toArray());
 * ```
 *
 * @param transform A function mapping a dataset element to a `Promise` for a
 *   transformed dataset element. This transform is responsible for disposing
 *   any intermediate `Tensor`s, i.e. by wrapping its computation in
 *   `tf.tidy()`; that cannot be automated here (as it is in the synchronous
 *   `map()` case).
 *
 * @returns A `Dataset` of transformed elements.
 *
 * @doc {heading: 'Data', subheading: 'Classes'}
 */
mapAsync<O extends tf.TensorContainer>(transform: (value: T) => Promise<O>): Dataset<O>;
/**
 * Creates a `Dataset` that prefetches elements from this dataset.
*
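 * A minimal usage sketch (the buffer size of 3 is arbitrary):
 * ```js
 * const a = tf.data.array([1, 2, 3, 4, 5, 6]).prefetch(3);
 * await a.forEachAsync(e => console.log(e));
 * ```
 *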
 * @param bufferSize: An integer specifying the number of elements to be
 *   prefetched.
 * @returns A `Dataset`.
 *
 * @doc {heading: 'Data', subheading: 'Classes'}
 */
prefetch(bufferSize: number): Dataset<T>;
/**
 * Repeats this dataset `count` times.
 *
 * NOTE: If this dataset is a function of global state (e.g. a random number
 * generator), then different repetitions may produce different elements.
 *
 * ```js
 * const a = tf.data.array([1, 2, 3]).repeat(3);
 * await a.forEachAsync(e => console.log(e));
 * ```
 *
 * @param count: (Optional) An integer, representing the number of times
 *   the dataset should be repeated. The default behavior (if `count` is
 *   `undefined` or negative) is for the dataset to be repeated indefinitely.
 * @returns A `Dataset`.
 *
 * @doc {heading: 'Data', subheading: 'Classes'}
 */
repeat(count?: number): Dataset<T>;
/**
 * Creates a `Dataset` that skips `count` initial elements from this dataset.
 *
 * ```js
 * const a = tf.data.array([1, 2, 3, 4, 5, 6]).skip(3);
 * await a.forEachAsync(e => console.log(e));
 * ```
 *
 * @param count: The number of elements of this dataset that should be skipped
 *   to form the new dataset. If `count` is greater than the size of this
 *   dataset, the new dataset will contain no elements. If `count`
 *   is `undefined` or negative, skips the entire dataset.
 *
 * @returns A `Dataset`.
 *
 * @doc {heading: 'Data', subheading: 'Classes'}
 */
skip(count: number): Dataset<T>;
static readonly MAX_BUFFER_SIZE = 10000;
/**
 * Pseudorandomly shuffles the elements of this dataset. This is done in a
 * streaming manner, by sampling from a given number of prefetched elements.
 *
 * ```js
 * const a = tf.data.array([1, 2, 3, 4, 5, 6]).shuffle(3);
 * await a.forEachAsync(e => console.log(e));
 * ```
 *
 * @param bufferSize: An integer specifying the number of elements from this
 *   dataset from which the new dataset will sample.
 * @param seed: (Optional) A string specifying the random seed that will
 *   be used to create the distribution.
 * @param reshuffleEachIteration: (Optional) A boolean, which if true
 *   indicates that the dataset should be pseudorandomly reshuffled each time
 *   it is iterated over. If false, elements will be returned in the same
 *   shuffled order on each iteration. (Defaults to `true`.)
 * @returns A `Dataset`.
 *
 * @doc {heading: 'Data', subheading: 'Classes'}
 */
shuffle(bufferSize: number, seed?: string, reshuffleEachIteration?: boolean): Dataset<T>;
/**
 * Creates a `Dataset` with at most `count` initial elements from this
 * dataset.
 *
 * ```js
 * const a = tf.data.array([1, 2, 3, 4, 5, 6]).take(3);
 * await a.forEachAsync(e => console.log(e));
 * ```
 *
 * @param count: The number of elements of this dataset that should be taken
 *   to form the new dataset. If `count` is `undefined` or negative, or if
 *   `count` is greater than the size of this dataset, the new dataset will
 *   contain all elements of this dataset.
 * @returns A `Dataset`.
 *
 * @doc {heading: 'Data', subheading: 'Classes'}
 */
take(count: number): Dataset<T>;
/**
 * Collect all elements of this dataset into an array.
 *
 * Obviously this will succeed only for small datasets that fit in memory.
 * Useful for testing and generally should be avoided if possible.
 *
 * ```js
 * const a = tf.data.array([1, 2, 3, 4, 5, 6]);
 * console.log(await a.toArray());
 * ```
 *
 * @returns A Promise for an array of elements, which will resolve
 *   when a new stream has been obtained and fully consumed.
 *
 * @doc {heading: 'Data', subheading: 'Classes'}
 */
toArray(): Promise<T[]>;
/**
 * Collect all elements of this dataset into an array, prefetching 100
 * elements at a time. This is useful for testing, because the prefetch
 * changes the order in which the Promises are resolved along the processing
 * pipeline. This may help expose bugs where results are dependent on the
 * order of Promise resolution rather than on the logical order of the stream
 * (i.e., due to hidden mutable state).
*
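 * A sketch mirroring the `toArray()` example above:
 * ```js
 * const a = tf.data.array([1, 2, 3, 4, 5, 6]);
 * console.log(await a.toArrayForTest());
 * ```
 *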
 * @returns A Promise for an array of elements, which will resolve
 *   when a new stream has been obtained and fully consumed.
 */
toArrayForTest(): Promise<T[]>;
}
/**
 * Create a `Dataset` defined by a provided iterator() function.
 *
 * ```js
 * let i = -1;
 * const func = () =>
 *     ++i < 5 ? {value: i, done: false} : {value: null, done: true};
 * const ds = tf.data.datasetFromIteratorFn(
 *     async () => tf.data.iteratorFromFunction(func));
 * await ds.forEachAsync(e => console.log(e));
 * ```
 */
export declare function datasetFromIteratorFn<T extends tf.TensorContainer>(iteratorFn: () => Promise<LazyIterator<T>>, size?: number): Dataset<T>;
/**
 * Create a `Dataset` from an array of elements.
 *
 * Create a Dataset from an array of objects:
 * ```js
 * const a = tf.data.array([{'item': 1}, {'item': 2}, {'item': 3}]);
 * await a.forEachAsync(e => console.log(e));
 * ```
 *
 * Create a Dataset from an array of numbers:
 * ```js
 * const a = tf.data.array([4, 5, 6]);
 * await a.forEachAsync(e => console.log(e));
 * ```
 * @param items An array of elements that will be parsed as items in a dataset.
 *
 * @doc {heading: 'Data', subheading: 'Creation', namespace: 'data'}
 */
export declare function array<T extends tf.TensorContainer>(items: T[]): Dataset<T>;
/**
 * Create a `Dataset` by zipping together an array, dict, or nested
 * structure of `Dataset`s (and perhaps additional constants).
 * The underlying datasets must provide elements in a consistent order such that
 * they correspond.
 *
 * The number of elements in the resulting dataset is the same as the size of
 * the smallest dataset in `datasets`.
 *
 * The nested structure of the `datasets` argument determines the
 * structure of elements in the resulting iterator.
 *
 * Note this means that, given an array of two datasets that produce dict
 * elements, the result is a dataset that produces elements that are arrays
 * of two dicts:
 *
 * Zip an array of datasets:
 * ```js
 * console.log('Zip two datasets of objects:');
 * const ds1 = tf.data.array([{a: 1}, {a: 2}, {a: 3}]);
 * const ds2 = tf.data.array([{b: 4}, {b: 5}, {b: 6}]);
 * const ds3 = tf.data.zip([ds1, ds2]);
 * await ds3.forEachAsync(e => console.log(JSON.stringify(e)));
 *
 * // If the goal is to merge the dicts in order to produce elements like
 * // {a: ..., b: ...}, this requires a second step such as:
 * console.log('Merge the objects:');
 * const ds4 = ds3.map(x => {return {a: x[0].a, b: x[1].b}});
 * await ds4.forEachAsync(e => console.log(e));
 * ```
 *
 * Zip a dict of datasets:
 * ```js
 * const a = tf.data.array([{a: 1}, {a: 2}, {a: 3}]);
 * const b = tf.data.array([{b: 4}, {b: 5}, {b: 6}]);
 * const c = tf.data.zip({c: a, d: b});
 * await c.forEachAsync(e => console.log(JSON.stringify(e)));
 * ```
 *
 * @doc {heading: 'Data', subheading: 'Operations', namespace: 'data'}
 */
export declare function zip<O extends tf.TensorContainer>(datasets: DatasetContainer): Dataset<O>;