/**
 * @license
 * Copyright 2022 Google LLC. All Rights Reserved.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * =============================================================================
 */
|
import { util } from '@tensorflow/tfjs-core';
|
import { useShapeUniforms } from './gpgpu_math';
|
/**
 * Program description for a packed (vec4-texel) 2D convolution.
 *
 * The constructor builds a GLSL shader body, stored in `userCode`, by
 * unrolling the filter columns in JavaScript. For every pair of filter
 * columns it emits code that gathers the needed input values into `xC<n>`
 * vec4 variables using as few `getX` texture reads as possible, then
 * accumulates their products with the matching `getW` weights into `dotProd`.
 *
 * Instance fields set by the constructor:
 *  - `variableNames`: shader input names, always starting with 'x' and 'W';
 *    'bias', 'preluActivationWeights' and 'leakyreluAlpha' are appended when
 *    the corresponding constructor flag is set.
 *  - `packedInputs` / `packedOutput`: both `true`.
 *  - `customUniforms`: declares the ivec2 uniforms `pads`, `strides`,
 *    `dilations` and `inDims` that the generated shader reads.
 *  - `outputShape`: taken from `convInfo.outShape`.
 *  - `enableShapeUniforms`: result of `useShapeUniforms(outputShape.length)`.
 *  - `userCode`: the generated GLSL source.
 */
export class Conv2DPackedProgram {
  /**
   * @param {Object} convInfo Convolution description. This constructor reads
   *     `outShape`, `padInfo.left`, `strideWidth`, `dilationWidth`,
   *     `filterHeight`, `filterWidth` and `inChannels` from it.
   *     NOTE(review): the generated code only distinguishes
   *     `strideWidth === 1` from "everything else", and the else branch is
   *     written for a stride of 2 - confirm callers never pass a larger
   *     stride.
   * @param {boolean} [addBias=false] When true, adds a 'bias' input and emits
   *     `result += getBiasAtOutCoords();`.
   * @param {?string} [activation=null] GLSL statements used verbatim as the
   *     body of the generated `vec4 activation(...)` function. When falsy, no
   *     activation is emitted.
   * @param {boolean} [hasPreluActivation=false] When true, adds a
   *     'preluActivationWeights' input; if `activation` is also set, the
   *     activation function receives those weights as `b`.
   * @param {boolean} [hasLeakyReluAlpha=false] When true, adds a
   *     'leakyreluAlpha' input; if `activation` is set and
   *     `hasPreluActivation` is false, the activation function receives the
   *     alpha as `b`.
   */
  constructor(convInfo, addBias = false, activation = null, hasPreluActivation = false, hasLeakyReluAlpha = false) {
    this.variableNames = ['x', 'W'];
    this.packedInputs = true;
    this.packedOutput = true;
    this.customUniforms = [
      { name: 'pads', type: 'ivec2' },
      { name: 'strides', type: 'ivec2' },
      { name: 'dilations', type: 'ivec2' },
      { name: 'inDims', type: 'ivec2' },
    ];
    this.outputShape = convInfo.outShape;
    this.enableShapeUniforms = useShapeUniforms(this.outputShape.length);

    const padLeft = convInfo.padInfo.left;
    const strideWidth = convInfo.strideWidth;
    const dilationWidth = convInfo.dilationWidth;
    const filterHeight = convInfo.filterHeight;
    const filterWidth = convInfo.filterWidth;
    const texelsAcross = filterWidth;

    // Scratch variables shared by every unrolled filter column.
    let mainLoop = `
       int xR; int xC; int xCOffset;
       vec4 wTexel; vec4 previous; vec4 final;`;

    // Per filter column: two raw texels (each with a "Ready" flag that
    // prevents re-reading an already fetched texel) and one composed vec4.
    for (let c = 0; c < filterWidth; c++) {
      mainLoop += `
           vec4 xTexelC${c * 2};
           int xTexelC${c * 2}Ready;
           vec4 xTexelC${c * 2 + 1};
           int xTexelC${c * 2 + 1}Ready;
           vec4 xC${c};`;
    }

    /**
     * This vectorized implementation works by gathering the values needed for
     * each output channel's dot product into vec4's and then multiplying them
     * all together (this happens in the accumulation code emitted at the end
     * of each unrolled column pair below). Most of the main loop consists of
     * constructing these vec4's with the minimum number of texture reads,
     * which means making use of all four returned values from a read at once.
     */
    mainLoop += `
     for (int r = 0; r < ${filterHeight}; r++) {
      for (int d1 = 0; d1 < ${convInfo.inChannels}; d1 += 2) {
       `;

    // Reset every texel, flag and composed value for each (r, d1) iteration.
    for (let c = 0; c < filterWidth; c++) {
      mainLoop += `
           xTexelC${c * 2} = vec4(0.0);
           xTexelC${c * 2}Ready = 0;
           xTexelC${c * 2 + 1} = vec4(0.0);
           xTexelC${c * 2 + 1}Ready = 0;
           xC${c} = vec4(0.0);`;
    }

    mainLoop += `
         xR = xRCorner + r * dilations[0];
         if (xR >=0 && xR < inDims[0]) {
       `;

    // Filter columns are handled two at a time: colIndex and colIndex + 1.
    for (let texelC = 0; texelC < (texelsAcross + 1) / 2; texelC++) {
      const colIndex = texelC * 2;

      mainLoop += `
           xC = xCCorner + ${colIndex * dilationWidth};
           `;

      if (strideWidth === 1) {
        if (colIndex < filterWidth) {
          // If padding is odd, the outer texels have to be composed.
          if (padLeft % 2 === 1) {
            // TODO: Ensure vec4 previous does not result in redundant sample,
            // and avoid setting xTexelRC's that exceed the boundary in the
            // first place rather than resetting them to vec4(0)).

            // To compute xCOffset:
            // - If padding is odd, we must add 1 to ensure we ask for an
            //   even-numbered column.
            // - We subtract 2 to access the previous texel.
            mainLoop += `
                 xCOffset = xC + 1;
                 if (xCOffset >= 0 && xCOffset < inDims[1] && xTexelC${colIndex}Ready == 0) {
                   xTexelC${colIndex} = getX(batch, xR, xCOffset, d1);

                   // Need to manually clear unused channels in case
                   // we're reading from recycled texture.
                   if (xCOffset + 1 >= inDims[1]) {
                     xTexelC${colIndex}.zw = vec2(0.0);
                   }
                   xTexelC${colIndex}Ready = 1;
                 }
               `;

            // This texel has been read in previous iteration if the dilation
            // is 1.
            if (dilationWidth === 1 && colIndex > 0) {
              mainLoop += `
                 xC${colIndex} = vec4(xTexelC${colIndex - 2}.zw, xTexelC${colIndex}.xy);
                 `;
            } else {
              mainLoop += `
                   xCOffset = xC + 1 - 2;

                   if (xCOffset >= 0 && xCOffset < inDims[1]) {
                     previous = getX(batch, xR, xCOffset, d1);

                     // Need to manually clear unused channels in case
                     // we're reading from recycled texture.
                     if (xCOffset + 1 >= inDims[1]) {
                       previous.zw = vec2(0.0);
                     }

                     xC${colIndex} = vec4(previous.zw, xTexelC${colIndex}.xy);
                   } else {
                     xC${colIndex} = vec4(0.0, 0.0, xTexelC${colIndex}.xy);
                   }
                   `;
            }
          } else {
            // Padding is even, so xRC corresponds to a single texel.
            mainLoop += `
                 if (xC >= 0 && xC < inDims[1] && xTexelC${colIndex}Ready == 0) {
                   xTexelC${colIndex} = getX(batch, xR, xC, d1);
                   if (xC + 1 >= inDims[1]) {
                     xTexelC${colIndex}.zw = vec2(0.0);
                   }
                   xTexelC${colIndex}Ready = 1;
                 }

                 xC${colIndex} = xTexelC${colIndex};
                 `;
          }

          if (colIndex + 1 < filterWidth) {
            // If dilation is even, the second entry should match the first
            // (either both are composed or both are single samples). But if
            // dilation is odd, then the second entry should be the opposite
            // of the first (if the first is composed, the second is a single
            // sample, and vice versa.)
            const nextTexelOffset = padLeft % 2 === 0 ?
                util.nearestLargerEven(dilationWidth) :
                dilationWidth;

            if ((dilationWidth % 2 === 0 && padLeft % 2 === 1) ||
                (dilationWidth % 2 !== 0 && padLeft % 2 !== 1)) {
              mainLoop += `
                   xCOffset = xC + imod(pads[1], 2) + ${nextTexelOffset};

                   if (xCOffset >= 0 && xCOffset < inDims[1] && xTexelC${colIndex + 1}Ready == 0) {
                     xTexelC${colIndex + 1} = getX(batch, xR, xCOffset, d1);

                     // Need to manually clear unused channels in case
                     // we're reading from recycled texture.
                     if (xCOffset + 1 >= inDims[1]) {
                       xTexelC${colIndex + 1}.zw = vec2(0.0);
                     }
                     xTexelC${colIndex + 1}Ready = 1;
                   }
                   `;

              // If dilation > 1 then the xRC's will not be able to share any
              // values, so each xRC will require two unique calls to getX.
              if (dilationWidth > 1) {
                // The zw half of `previous` addresses column xCOffset + 1.
                // When that column lies past the input width it must be
                // cleared, exactly as the other composed branch does;
                // otherwise stale data from a recycled texture could leak
                // into a valid output column.
                mainLoop += `
                     xCOffset -= 2;
                     if (xCOffset >= 0 && xCOffset < inDims[1]) {
                      previous = getX(batch, xR, xCOffset, d1);

                      // Need to manually clear unused channels in case
                      // we're reading from recycled texture.
                      if (xCOffset + 1 >= inDims[1]) {
                        previous.zw = vec2(0.0);
                      }

                      xC${colIndex + 1} = vec4(previous.zw, xTexelC${colIndex + 1}.xy);
                     } else {
                      xC${colIndex + 1} = vec4(0.0, 0.0, xTexelC${colIndex + 1}.xy);
                     }
                     `;
              } else {
                mainLoop += `
                     xC${colIndex + 1} = vec4(xTexelC${colIndex}.zw, xTexelC${colIndex + 1}.xy);
                     `;
              }
            } else {
              // If dilation is 1 and padding is odd, we have already read the
              // texel when constructing the previous x value. Here we can
              // simply skip the texture read.
              if (nextTexelOffset === 1) {
                mainLoop += `
                     xC${colIndex + 1} = xTexelC${colIndex};
                     `;
              } else {
                mainLoop += `
                     xCOffset = xC + ${nextTexelOffset};

                     if (xCOffset >= 0 && xCOffset < inDims[1] && xTexelC${colIndex + 1}Ready == 0) {
                       xTexelC${colIndex + 1} = getX(batch, xR, xCOffset, d1);
                       if (xCOffset + 1 >= inDims[1]) {
                         xTexelC${colIndex + 1}.zw = vec2(0.0);
                       }
                       xTexelC${colIndex + 1}Ready = 1;
                     }

                     xC${colIndex + 1} = xTexelC${colIndex + 1};
                     `;
              }
            }
          }
        }
      } else { // stride === 2
        if (colIndex < filterWidth) {
          // Depending on whether padLeft is even or odd, we want either the
          // xy or zw channels from X texels for xC<colIndex>. If padLeft is
          // even, xC<colIndex + 1> is simply the zw channels of texels we've
          // already sampled. But if padLeft is odd, xC<colIndex + 1>.zw will
          // need to come from the xy channels of a new texel, hence the
          // "vec4 final" initialized below.
          if (padLeft % 2 === 1) {
            mainLoop += `
                 xCOffset = xC + 1 - strides[1];
                 if(xCOffset >= 0 && xCOffset < inDims[1] && xTexelC${colIndex}Ready == 0) {
                   xTexelC${colIndex} = getX(batch, xR, xCOffset, d1);
                   // Need to manually clear unused channels in case
                   // we're reading from recycled texture.
                   if (xCOffset + 1 >= inDims[1]) {
                     xTexelC${colIndex}.zw = vec2(0.0);
                   }
                   xTexelC${colIndex}Ready = 1;
                 }

                 if(xC + 1 >= 0 && xC + 1 < inDims[1] && xTexelC${colIndex + 1}Ready == 0) {
                   xTexelC${colIndex + 1} = getX(batch, xR, xC + 1, d1);
                   // Need to manually clear unused channels in case
                   // we're reading from recycled texture.
                   if (xC + 2 >= inDims[1]) {
                     xTexelC${colIndex + 1}.zw = vec2(0.0);
                   }
                   xTexelC${colIndex + 1}Ready = 1;
                 }

                 xC${colIndex} = vec4(xTexelC${colIndex}.zw, xTexelC${colIndex + 1}.zw);
               `;

            if (colIndex + 1 < filterWidth) {
              // Only final.xy is consumed, and xy addresses column xCOffset
              // itself, which the bounds check has already validated.
              mainLoop += `
                   final = vec4(0.0);
                   xCOffset = xC + 1 + strides[1];
                   if(xCOffset >= 0 && xCOffset < inDims[1]) {
                     final = getX(batch, xR, xCOffset, d1);
                   }
                   xC${colIndex + 1} = vec4(xTexelC${colIndex + 1}.xy, final.xy);
                 `;
            }
          } else {
            mainLoop += `
                 if(xC >= 0 && xC < inDims[1] && xTexelC${colIndex}Ready == 0) {
                   xTexelC${colIndex} = getX(batch, xR, xC, d1);
                   if (xC + 1 >= inDims[1]) {
                     xTexelC${colIndex}.zw = vec2(0.0);
                   }
                   xTexelC${colIndex}Ready = 1;
                 }

                 xCOffset = xC + strides[1];
                 if(xCOffset >= 0 && xCOffset < inDims[1] && xTexelC${colIndex + 1}Ready == 0) {
                   xTexelC${colIndex + 1} = getX(batch, xR, xCOffset, d1);
                   if (xCOffset + 1 >= inDims[1]) {
                     xTexelC${colIndex + 1}.zw = vec2(0.);
                   }
                   xTexelC${colIndex + 1}Ready = 1;
                 }

                 xC${colIndex} = vec4(
                   xTexelC${colIndex}.xy, xTexelC${colIndex + 1}.xy);
               `;

            if (colIndex + 1 < filterWidth) {
              mainLoop += `
                   xC${colIndex + 1} = vec4(xTexelC${colIndex}.zw, xTexelC${colIndex + 1}.zw);
                 `;
            }
          }
        }
      }

      // Localize the dotProd accumulation within the loop. The theory is that
      // on GPUs with limited cache, accumulating the sum across a large
      // number of variables causes lots of cache misses (e.g. a 5x5 filter
      // would have 50 variables).
      if (colIndex < filterWidth) {
        mainLoop += `
             wTexel = getW(r, ${colIndex}, d1, d2);
             dotProd += xC${colIndex}.xxzz * vec4(wTexel.xy, wTexel.xy);
             if(d1 + 1 < ${convInfo.inChannels}) {
               dotProd += xC${colIndex}.yyww * vec4(wTexel.zw, wTexel.zw);
             }
           `;

        if (colIndex + 1 < filterWidth) {
          mainLoop += `
               wTexel = getW(r, ${colIndex + 1}, d1, d2);
               dotProd += xC${colIndex + 1}.xxzz * vec4(wTexel.xy, wTexel.xy);
               if(d1 + 1 < ${convInfo.inChannels}) {
                 dotProd += xC${colIndex + 1}.yyww * vec4(wTexel.zw, wTexel.zw);
               }
             `;
        }
      }
    }

    // Close, in order: the `if (xR ...)` guard, the d1 loop and the r loop.
    mainLoop += `
         }
      }
     }
   `;

    let activationSnippet = '';
    let applyActivationSnippet = '';
    if (activation) {
      if (hasPreluActivation) {
        activationSnippet = `vec4 activation(vec4 a) {
           vec4 b = getPreluActivationWeightsAtOutCoords();
           ${activation}
         }`;
      } else if (hasLeakyReluAlpha) {
        activationSnippet = `vec4 activation(vec4 a) {
           vec4 b = getLeakyreluAlphaAtOutCoords();
           ${activation}
         }`;
      } else {
        activationSnippet = `vec4 activation(vec4 x) {
           ${activation}
         }`;
      }

      applyActivationSnippet = `result = activation(result);`;
    }

    const addBiasSnippet = addBias ? 'result += getBiasAtOutCoords();' : '';
    if (addBias) {
      this.variableNames.push('bias');
    }

    // The extra inputs are registered whenever their flag is set, even when
    // no activation snippet was supplied.
    if (hasPreluActivation) {
      this.variableNames.push('preluActivationWeights');
    }
    if (hasLeakyReluAlpha) {
      this.variableNames.push('leakyreluAlpha');
    }

    this.userCode = `
       ${activationSnippet}

       void main() {
         ivec4 coords = getOutputCoords();
         int batch = coords.x;
         ivec2 xRCCorner = coords.yz * strides - pads;
         int d2 = coords.w;
         int xRCorner = xRCCorner.x;
         int xCCorner = xRCCorner.y;

         // Initializing dotProd with a small epsilon seems to reduce GPU accuracy loss.
         vec4 dotProd = vec4(0.000000000000001);

         ${mainLoop}

         vec4 result = dotProd - vec4(0.000000000000001);
         ${addBiasSnippet}
         ${applyActivationSnippet}
         setOutput(result);
       }
     `;
  }
}
|
//# sourceMappingURL=data:application/json;base64,{"version":3,"file":"conv_packed_gpu.js","sourceRoot":"","sources":["../../../../../tfjs-backend-webgl/src/conv_packed_gpu.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;GAeG;AAEF,OAAO,EAAe,IAAI,EAAC,MAAM,uBAAuB,CAAC;AAEzD,OAAO,EAAe,gBAAgB,EAAC,MAAM,cAAc,CAAC;AAE5D,MAAM,OAAO,mBAAmB;IAc9B,YACI,QAAiC,EAAE,OAAO,GAAG,KAAK,EAClD,aAAqB,IAAI,EAAE,kBAAkB,GAAG,KAAK,EACrD,iBAAiB,GAAG,KAAK;QAhB7B,kBAAa,GAAG,CAAC,GAAG,EAAE,GAAG,CAAC,CAAC;QAC3B,iBAAY,GAAG,IAAI,CAAC;QACpB,iBAAY,GAAG,IAAI,CAAC;QAIpB,mBAAc,GAAG;YACf,EAAC,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE,OAAgB,EAAE;YACvC,EAAC,IAAI,EAAE,SAAS,EAAE,IAAI,EAAE,OAAgB,EAAE;YAC1C,EAAC,IAAI,EAAE,WAAW,EAAE,IAAI,EAAE,OAAgB,EAAE;YAC5C,EAAC,IAAI,EAAE,QAAQ,EAAE,IAAI,EAAE,OAAgB,EAAE;SAC1C,CAAC;QAMA,IAAI,CAAC,WAAW,GAAG,QAAQ,CAAC,QAAQ,CAAC;QACrC,IAAI,CAAC,mBAAmB,GAAG,gBAAgB,CAAC,IAAI,CAAC,WAAW,CAAC,MAAM,CAAC,CAAC;QACrE,MAAM,OAAO,GAAG,QAAQ,CAAC,OAAO,CAAC,IAAI,CAAC;QACtC,MAAM,WAAW,GAAG,QAAQ,CAAC,WAAW,CAAC;QACzC,MAAM,aAAa,GAAG,QAAQ,CAAC,aAAa,CAAC;QAC7C,MAAM,YAAY,GAAG,QAAQ,CAAC,YAAY,CAAC;QAC3C,MAAM,WAAW,GAAG,QAAQ,CAAC,WAAW,CAAC;QACzC,MAAM,YAAY,GAAG,WAAW,CAAC;QAEjC,IAAI,QAAQ,GAAG;;+CAE2B,CAAC;QAE3C,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,WAAW,EAAE,CAAC,EAAE,EAAE;YACpC,QAAQ,IAAI;yBACM,CAAC,GAAG,CAAC;wBACN,CAAC,GAAG,CAAC;yBACJ,CAAC,GAAG,CAAC,GAAG,CAAC;wBACV,CAAC,GAAG,CAAC,GAAG,CAAC;oBACb,CAAC,GAAG,CAAC;SACnB;QAED;;;;;;;WAOG;QACH,QAAQ,IAAI;2BACU,YAAY;8BACT,QAAQ,CAAC,UAAU;QACzC,CAAC;QACJ,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,WAAW,EAAE,CAAC,EAAE,EAAE;YACpC,QAAQ,IAAI;oBACC,CAAC,GAAG,CAAC;oBACL,CAAC,GAAG,CAAC;oBACL,CAAC,GAAG,CAAC,GAAG,CAAC;oBACT,CAAC,GAAG,CAAC,GAAG,CAAC;eACd,CAAC,eAAe,CAAC;SAC1B;QACD,QAAQ,IAAI;;;QAGT,CAAC;QAEJ,KAAK,IAAI,MAAM,GAAG,CAAC,EAAE,MAAM,GAAG,CAAC,YAAY,GAAG,CAAC,CAAC,GAAG,CAAC,EAAE,MAAM,EAAE,EAAE;YAC9D,MAAM,QAAQ,GAAG,MAAM,GAAG,CAAC,CAAC;YAE5B,QAAQ,IAAI;6BACU,QAAQ,GAAG,aAAa;YACzC,CAAC;YAEN,IAAI,WAAW,KAAK,CAAC,EAAE;gBACrB,IAAI,QAAQ,GAAG,WAAW,EAAE;oBAC1B,2DAA2D;oBAC3D,IAAI,OAAO,GAAG,CAAC,KAAK
,CAAC,EAAE;wBACrB,kEAAkE;wBAClE,+DAA+D;wBAC/D,sDAAsD;wBAEtD,uBAAuB;wBACvB,6DAA6D;wBAC7D,qBAAqB;wBACrB,gDAAgD;wBAEhD,QAAQ,IAAI;;uEAGR,QAAQ;4BACG,QAAQ;;;;;8BAKN,QAAQ;;4BAEV,QAAQ;;gBAEpB,CAAC;wBACJ,iEAAiE;wBACjE,QAAQ;wBACR,IAAI,aAAa,KAAK,CAAC,IAAI,QAAQ,GAAG,CAAC,EAAE;4BACvC,QAAQ,IAAI;qBACN,QAAQ,kBAAkB,QAAQ,GAAG,CAAC,eACxC,QAAQ;kBACT,CAAC;yBACL;6BAAM;4BACL,QAAQ,IAAI;;;;;;;;;;;;yBAYF,QAAQ,+BAA+B,QAAQ;;yBAE/C,QAAQ,4BAA4B,QAAQ;;oBAEjD,CAAC;yBACP;qBACF;yBAAM;wBACL,yDAAyD;wBACzD,QAAQ,IAAI;2DACkC,QAAQ;4BACvC,QAAQ;;8BAEN,QAAQ;;4BAEV,QAAQ;;;qBAGf,QAAQ,aAAa,QAAQ;kBAChC,CAAC;qBACP;oBAED,IAAI,QAAQ,GAAG,CAAC,GAAG,WAAW,EAAE;wBAC9B,+DAA+D;wBAC/D,gEAAgE;wBAChE,gEAAgE;wBAChE,iEAAiE;wBACjE,2BAA2B;wBAE3B,MAAM,eAAe,GAAG,OAAO,GAAG,CAAC,KAAK,CAAC,CAAC,CAAC;4BACvC,IAAI,CAAC,iBAAiB,CAAC,aAAa,CAAC,CAAC,CAAC;4BACvC,aAAa,CAAC;wBAElB,IAAI,CAAC,aAAa,GAAG,CAAC,KAAK,CAAC,IAAI,OAAO,GAAG,CAAC,KAAK,CAAC,CAAC;4BAC9C,CAAC,aAAa,GAAG,CAAC,KAAK,CAAC,IAAI,OAAO,GAAG,CAAC,KAAK,CAAC,CAAC,EAAE;4BAClD,QAAQ,IAAI;wDAC6B,eAAe;;yEAGpD,QAAQ,GAAG,CAAC;8BACD,QAAQ,GAAG,CAAC;;;;;gCAKV,QAAQ,GAAG,CAAC;;8BAEd,QAAQ,GAAG,CAAC;;oBAEtB,CAAC;4BAEN,+DAA+D;4BAC/D,6DAA6D;4BAC7D,IAAI,aAAa,GAAG,CAAC,EAAE;gCACrB,QAAQ,IAAI;;;;0BAIH,QAAQ,GAAG,CAAC,+BACd,QAAQ,GAAG,CAAC;;0BAEV,QAAQ,GAAG,CAAC,4BACd,QAAQ,GAAG,CAAC;;sBAEd,CAAC;6BACP;iCAAM;gCACL,QAAQ,IAAI;yBACJ,QAAQ,GAAG,CAAC,kBAAkB,QAAQ,eAC1C,QAAQ,GAAG,CAAC;sBACX,CAAC;6BACP;yBAEF;6BAAM;4BACL,gEAAgE;4BAChE,4DAA4D;4BAC5D,gCAAgC;4BAChC,IAAI,eAAe,KAAK,CAAC,EAAE;gCACzB,QAAQ,IAAI;yBACJ,QAAQ,GAAG,CAAC,aAAa,QAAQ;sBACpC,CAAC;6BACP;iCAAM;gCACL,QAAQ,IAAI;uCACU,eAAe;;2EAGjC,QAAQ,GAAG,CAAC;gCACD,QAAQ,GAAG,CAAC;;kCAEV,QAAQ,GAAG,CAAC;;gCAEd,QAAQ,GAAG,CAAC;;;yBAGnB,QAAQ,GAAG,CAAC,aAAa,QAAQ,GAAG,CAAC;sBACxC,CAAC;6BACP;yBACF;qBACF;iBACF;aACF;iBAAM,EAAG,eAAe;gBACvB,IAAI,QAAQ,GAAG,WAAW,EAAE;oBAC1B,kEAAkE;oBAClE,mEAAmE;oBACnE,mEAAmE;oBACnE,6DAA6D;oBAC7D,gEAAgE;oBAChE,OAAO;oBACP,4BAA4B;oBAC5B,IAAI,OAAO,GAAG,CAAC,KAAK,CAAC,EAAE;wBACrB,QAAQ,IAAI;;sEAGR,QAAQ;4BACG,QAAQ;;;;8BAIN,QAAQ;;4BAEV,QAAQ;;;kEAInB,QAAQ,GAAG,CAA
C;4BACD,QAAQ,GAAG,CAAC;;;;8BAIV,QAAQ,GAAG,CAAC;;4BAEd,QAAQ,GAAG,CAAC;;;qBAGnB,QAAQ,kBAAkB,QAAQ,eACtC,QAAQ,GAAG,CAAC;gBACb,CAAC;wBAEJ,IAAI,QAAQ,GAAG,CAAC,GAAG,WAAW,EAAE;4BAC9B,QAAQ,IAAI;;;;;;uBAMJ,QAAQ,GAAG,CAAC,kBAAkB,QAAQ,GAAG,CAAC;kBAC/C,CAAC;yBACL;qBACF;yBAAM;wBACL,QAAQ,IAAI;0DACiC,QAAQ;4BACtC,QAAQ;;8BAEN,QAAQ;;4BAEV,QAAQ;;;;sEAKnB,QAAQ,GAAG,CAAC;4BACD,QAAQ,GAAG,CAAC;;8BAEV,QAAQ,GAAG,CAAC;;4BAEd,QAAQ,GAAG,CAAC;;;qBAGnB,QAAQ;4BACD,QAAQ,eAAe,QAAQ,GAAG,CAAC;gBAC/C,CAAC;wBAEJ,IAAI,QAAQ,GAAG,CAAC,GAAG,WAAW,EAAE;4BAC9B,QAAQ,IAAI;uBACJ,QAAQ,GAAG,CAAC,kBAAkB,QAAQ,eAC1C,QAAQ,GAAG,CAAC;kBACb,CAAC;yBACL;qBACF;iBACF;aACF;YAED,uEAAuE;YACvE,gEAAgE;YAChE,wEAAwE;YACxE,gBAAgB;YAChB,IAAI,QAAQ,GAAG,WAAW,EAAE;gBAC1B,QAAQ,IAAI;gCACW,QAAQ;4BACZ,QAAQ;2BACT,QAAQ,CAAC,UAAU;8BAChB,QAAQ;;YAE1B,CAAC;gBAEJ,IAAI,QAAQ,GAAG,CAAC,GAAG,WAAW,EAAE;oBAC9B,QAAQ,IAAI;kCACW,QAAQ,GAAG,CAAC;8BAChB,QAAQ,GAAG,CAAC;6BACb,QAAQ,CAAC,UAAU;gCAChB,QAAQ,GAAG,CAAC;;cAE9B,CAAC;iBACL;aACF;SACF;QACD,QAAQ,IAAI;;IAEb,CAAC;QACF,QAAQ,IAAI;;IAEX,CAAC;QACF,QAAQ,IAAI;;IAEX,CAAC;QAEA,IAAI,iBAAiB,GAAG,EAAE,EAAE,sBAAsB,GAAG,EAAE,CAAC;QACxD,IAAI,UAAU,EAAE;YACd,IAAI,kBAAkB,EAAE;gBACtB,iBAAiB,GAAG;;aAEhB,UAAU;WACZ,CAAC;aACJ;iBAAM,IAAI,iBAAiB,EAAE;gBAC5B,iBAAiB,GAAG;;aAEhB,UAAU;WACZ,CAAC;aACJ;iBAAM;gBACL,iBAAiB,GAAG;aAChB,UAAU;WACZ,CAAC;aACJ;YAED,sBAAsB,GAAG,8BAA8B,CAAC;SACzD;QAED,MAAM,cAAc,GAAG,OAAO,CAAC,CAAC,CAAC,iCAAiC,CAAC,CAAC,CAAC,EAAE,CAAC;QACxE,IAAI,OAAO,EAAE;YACX,IAAI,CAAC,aAAa,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;SACjC;QAED,IAAI,kBAAkB,EAAE;YACtB,IAAI,CAAC,aAAa,CAAC,IAAI,CAAC,wBAAwB,CAAC,CAAC;SACnD;QACD,IAAI,iBAAiB,EAAE;YACrB,IAAI,CAAC,aAAa,CAAC,IAAI,CAAC,gBAAgB,CAAC,CAAC;SAC3C;QAED,IAAI,CAAC,QAAQ,GAAG;SACZ,iBAAiB;;;;;;;;;;;;;WAaf,QAAQ;;;WAGR,cAAc;WACd,sBAAsB;;;MAG3B,CAAC;IACJ,CAAC;CACF","sourcesContent":["/**\n * @license\n * Copyright 2022 Google LLC. 
All Rights Reserved.\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may not use this file except in compliance with the License.\n * You may obtain a copy of the License at\n *\n * http://www.apache.org/licenses/LICENSE-2.0\n *\n * Unless required by applicable law or agreed to in writing, software\n * distributed under the License is distributed on an \"AS IS\" BASIS,\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n * See the License for the specific language governing permissions and\n * limitations under the License.\n * =============================================================================\n */\n\n import {backend_util, util} from '@tensorflow/tfjs-core';\n\n import {GPGPUProgram, useShapeUniforms} from './gpgpu_math';\n\n export class Conv2DPackedProgram implements GPGPUProgram {\n   variableNames = ['x', 'W'];\n   packedInputs = true;\n   packedOutput = true;\n   outputShape: number[];\n   userCode: string;\n   enableShapeUniforms: boolean;\n   customUniforms = [\n     {name: 'pads', type: 'ivec2' as const },\n     {name: 'strides', type: 'ivec2' as const },\n     {name: 'dilations', type: 'ivec2' as const },\n     {name: 'inDims', type: 'ivec2' as const },\n   ];\n\n   constructor(\n       convInfo: backend_util.Conv2DInfo, addBias = false,\n       activation: string = null, hasPreluActivation = false,\n       hasLeakyReluAlpha = false) {\n     this.outputShape = convInfo.outShape;\n     this.enableShapeUniforms = useShapeUniforms(this.outputShape.length);\n     const padLeft = convInfo.padInfo.left;\n     const strideWidth = convInfo.strideWidth;\n     const dilationWidth = convInfo.dilationWidth;\n     const filterHeight = convInfo.filterHeight;\n     const filterWidth = convInfo.filterWidth;\n     const texelsAcross = filterWidth;\n\n     let mainLoop = `\n       int xR; int xC; int xCOffset;\n       vec4 wTexel; vec4 previous; vec4 final;`;\n\n     for (let c = 0; c < filterWidth; c++) 
{\n       mainLoop += `\n           vec4 xTexelC${c * 2};\n           int xTexelC${c * 2}Ready;\n           vec4 xTexelC${c * 2 + 1};\n           int xTexelC${c * 2 + 1}Ready;\n           vec4 xC${c};`;\n     }\n\n     /**\n      * This vectorized implementation works by gathering the values needed for\n      * each output channel's dot product into vec4's and then multiplying them\n      * all together (this happens in the final double for-loop below). Most of\n      * the main loop consists of constructing these vec4's with the minimum\n      * number of texture2D calls, which means making use of all four returned\n      * values from a texture2D call at once.\n      */\n     mainLoop += `\n     for (int r = 0; r < ${filterHeight}; r++) {\n      for (int d1 = 0; d1 < ${convInfo.inChannels}; d1 += 2) {\n       `;\n     for (let c = 0; c < filterWidth; c++) {\n       mainLoop += `\n           xTexelC${c * 2} = vec4(0.0);\n           xTexelC${c * 2}Ready = 0;\n           xTexelC${c * 2 + 1} = vec4(0.0);\n           xTexelC${c * 2 + 1}Ready = 0;\n           xC${c} = vec4(0.0);`;\n     }\n     mainLoop += `\n         xR = xRCorner + r * dilations[0];\n         if (xR >=0 && xR < inDims[0]) {\n       `;\n\n     for (let texelC = 0; texelC < (texelsAcross + 1) / 2; texelC++) {\n       const colIndex = texelC * 2;\n\n       mainLoop += `\n           xC = xCCorner + ${colIndex * dilationWidth};\n           `;\n\n       if (strideWidth === 1) {\n         if (colIndex < filterWidth) {\n           // If padding is odd, the outer texels have to be composed.\n           if (padLeft % 2 === 1) {\n             // TODO: Ensure vec4 previous does not result in redundant sample,\n             // and avoid setting xTexelRC's that exceed the boundary in the\n             // first place rather than resetting them to vec4(0)).\n\n             // To compute xCOffset:\n             // - If padding is odd, we must add 1 to ensure we ask for an\n             // even-numbered row.\n         
    // - We subtract 2 to access the previous texel.\n\n             mainLoop += `\n                 xCOffset = xC + 1;\n                 if (xCOffset >= 0 && xCOffset < inDims[1] && xTexelC${\n                 colIndex}Ready == 0) {\n                   xTexelC${colIndex} = getX(batch, xR, xCOffset, d1);\n\n                   // Need to manually clear unused channels in case\n                   // we're reading from recycled texture.\n                   if (xCOffset + 1 >= inDims[1]) {\n                     xTexelC${colIndex}.zw = vec2(0.0);\n                   }\n                   xTexelC${colIndex}Ready = 1;\n                 }\n               `;\n             // This texel has been read in previous iteration if the dilation\n             // is 1.\n             if (dilationWidth === 1 && colIndex > 0) {\n               mainLoop += `\n                 xC${colIndex} = vec4(xTexelC${colIndex - 2}.zw, xTexelC${\n                   colIndex}.xy);\n                 `;\n             } else {\n               mainLoop += `\n                   xCOffset = xC + 1 - 2;\n\n                   if (xCOffset >= 0 && xCOffset < inDims[1]) {\n                     previous = getX(batch, xR, xCOffset, d1);\n\n                     // Need to manually clear unused channels in case\n                     // we're reading from recycled texture.\n                     if (xCOffset + 1 >= inDims[1]) {\n                       previous.zw = vec2(0.0);\n                     }\n\n                     xC${colIndex} = vec4(previous.zw, xTexelC${colIndex}.xy);\n                   } else {\n                     xC${colIndex} = vec4(0.0, 0.0, xTexelC${colIndex}.xy);\n                   }\n                   `;\n             }\n           } else {\n             // Padding is even, so xRC corresponds to a single texel.\n             mainLoop += `\n                 if (xC >= 0 && xC < inDims[1] && xTexelC${colIndex}Ready == 0) {\n                   xTexelC${colIndex} = getX(batch, xR, xC, d1);\n         
          if (xC + 1 >= inDims[1]) {\n                     xTexelC${colIndex}.zw = vec2(0.0);\n                   }\n                   xTexelC${colIndex}Ready = 1;\n                 }\n\n                 xC${colIndex} = xTexelC${colIndex};\n                 `;\n           }\n\n           if (colIndex + 1 < filterWidth) {\n             // If dilation is even, the second entry should match the first\n             // (either both are composed or both are single samples). But if\n             // dilation is odd, then the second entry should be the opposite\n             // of the first (if the first is composed, the second is a single\n             // sample, and vice versa.)\n\n             const nextTexelOffset = padLeft % 2 === 0 ?\n                 util.nearestLargerEven(dilationWidth) :\n                 dilationWidth;\n\n             if ((dilationWidth % 2 === 0 && padLeft % 2 === 1) ||\n                 (dilationWidth % 2 !== 0 && padLeft % 2 !== 1)) {\n               mainLoop += `\n                   xCOffset = xC + imod(pads[1], 2) + ${nextTexelOffset};\n\n                   if (xCOffset >= 0 && xCOffset < inDims[1] && xTexelC${\n                   colIndex + 1}Ready == 0) {\n                     xTexelC${colIndex + 1} = getX(batch, xR, xCOffset, d1);\n\n                     // Need to manually clear unused channels in case\n                     // we're reading from recycled texture.\n                     if (xCOffset + 1 >= inDims[1]) {\n                       xTexelC${colIndex + 1}.zw = vec2(0.0);\n                     }\n                     xTexelC${colIndex + 1}Ready = 1;\n                   }\n                   `;\n\n               // If dilation > 1 then the xRC's will not be able to share any\n               // values, so each xRC will require two unique calls to getX.\n               if (dilationWidth > 1) {\n                 mainLoop += `\n                     xCOffset -= 2;\n                     if (xCOffset >= 0 && xCOffset < inDims[1]) {\n      
                previous = getX(batch, xR, xCOffset, d1);\n                      xC${colIndex + 1} = vec4(previous.zw, xTexelC${\n                        colIndex + 1}.xy);\n                     } else {\n                      xC${colIndex + 1} = vec4(0.0, 0.0, xTexelC${\n                        colIndex + 1}.xy);\n                     }\n                     `;\n               } else {\n                 mainLoop += `\n                     xC${colIndex + 1} = vec4(xTexelC${colIndex}.zw, xTexelC${\n                     colIndex + 1}.xy);\n                     `;\n               }\n\n             } else {\n               // If dilation is 1 and padding is odd, we have already read the\n               // texel when constructing the previous x value. Here we can\n               // simply skip the texture read.\n               if (nextTexelOffset === 1) {\n                 mainLoop += `\n                     xC${colIndex + 1} = xTexelC${colIndex};\n                     `;\n               } else {\n                 mainLoop += `\n                     xCOffset = xC + ${nextTexelOffset};\n\n                     if (xCOffset >= 0 && xCOffset < inDims[1] && xTexelC${\n                     colIndex + 1}Ready == 0) {\n                       xTexelC${colIndex + 1} = getX(batch, xR, xCOffset, d1);\n                       if (xCOffset + 1 >= inDims[1]) {\n                         xTexelC${colIndex + 1}.zw = vec2(0.0);\n                       }\n                       xTexelC${colIndex + 1}Ready = 1;\n                     }\n\n                     xC${colIndex + 1} = xTexelC${colIndex + 1};\n                     `;\n               }\n             }\n           }\n         }\n       } else {  // stride === 2\n         if (colIndex < filterWidth) {\n           // Depending on whether padLeft is even or odd, we want either the\n           // xy or zw channels from X texels for xC${colIndex}. 
If padLeft is\n           // even, xC${colIndex +1} is simply the zw channels of texels we've\n           // already sampled. But if padLeft is odd, xC{$c + 1}.zw will\n           // need to come from the xy channels of a new texel, hence the `\n           // vec4\n           // final` initialized below.\n           if (padLeft % 2 === 1) {\n             mainLoop += `\n                 xCOffset = xC + 1 - strides[1];\n                 if(xCOffset >= 0 && xCOffset < inDims[1] && xTexelC${\n                 colIndex}Ready == 0) {\n                   xTexelC${colIndex} = getX(batch, xR, xCOffset, d1);\n                   // Need to manually clear unused channels in case\n                   // we're reading from recycled texture.\n                   if (xCOffset + 1 >= inDims[1]) {\n                     xTexelC${colIndex}.zw = vec2(0.0);\n                   }\n                   xTexelC${colIndex}Ready = 1;\n                 }\n\n                 if(xC + 1 >= 0 && xC + 1 < inDims[1] && xTexelC${\n                 colIndex + 1}Ready == 0) {\n                   xTexelC${colIndex + 1} = getX(batch, xR, xC + 1, d1);\n                   // Need to manually clear unused channels in case\n                   // we're reading from recycled texture.\n                   if (xC + 2 >= inDims[1]) {\n                     xTexelC${colIndex + 1}.zw = vec2(0.0);\n                   }\n                   xTexelC${colIndex + 1}Ready = 1;\n                 }\n\n                 xC${colIndex} = vec4(xTexelC${colIndex}.zw, xTexelC${\n                 colIndex + 1}.zw);\n               `;\n\n             if (colIndex + 1 < filterWidth) {\n               mainLoop += `\n                   final = vec4(0.0);\n                   xCOffset = xC + 1 + strides[1];\n                   if(xCOffset >= 0 && xCOffset < inDims[1]) {\n                     final = getX(batch, xR, xCOffset, d1);\n                   }\n                   xC${colIndex + 1} = vec4(xTexelC${colIndex + 1}.xy, final.xy);\n         
        `;\n             }\n           } else {\n             mainLoop += `\n                 if(xC >= 0 && xC < inDims[1] && xTexelC${colIndex}Ready == 0) {\n                   xTexelC${colIndex} = getX(batch, xR, xC, d1);\n                   if (xC + 1 >= inDims[1]) {\n                     xTexelC${colIndex}.zw = vec2(0.0);\n                   }\n                   xTexelC${colIndex}Ready = 1;\n                 }\n\n                 xCOffset = xC + strides[1];\n                 if(xCOffset >= 0 && xCOffset < inDims[1] && xTexelC${\n                 colIndex + 1}Ready == 0) {\n                   xTexelC${colIndex + 1} = getX(batch, xR, xCOffset, d1);\n                   if (xCOffset + 1 >= inDims[1]) {\n                     xTexelC${colIndex + 1}.zw = vec2(0.);\n                   }\n                   xTexelC${colIndex + 1}Ready = 1;\n                 }\n\n                 xC${colIndex} = vec4(\n                   xTexelC${colIndex}.xy, xTexelC${colIndex + 1}.xy);\n               `;\n\n             if (colIndex + 1 < filterWidth) {\n               mainLoop += `\n                   xC${colIndex + 1} = vec4(xTexelC${colIndex}.zw, xTexelC${\n                   colIndex + 1}.zw);\n                 `;\n             }\n           }\n         }\n       }\n\n       // localize the dotProd accumulation within the loop, the theory is for\n       // GPU with limited cache, accumulate sum across large amount of\n       // veriables will cause lots of cache misses. (i.e. 
5x5 filter will have\n       // 50 variables)\n       if (colIndex < filterWidth) {\n         mainLoop += `\n             wTexel = getW(r, ${colIndex}, d1, d2);\n             dotProd += xC${colIndex}.xxzz * vec4(wTexel.xy, wTexel.xy);\n             if(d1 + 1 < ${convInfo.inChannels}) {\n               dotProd += xC${colIndex}.yyww * vec4(wTexel.zw, wTexel.zw);\n             }\n           `;\n\n         if (colIndex + 1 < filterWidth) {\n           mainLoop += `\n               wTexel = getW(r, ${colIndex + 1}, d1, d2);\n               dotProd += xC${colIndex + 1}.xxzz * vec4(wTexel.xy, wTexel.xy);\n               if(d1 + 1 < ${convInfo.inChannels}) {\n                 dotProd += xC${colIndex + 1}.yyww * vec4(wTexel.zw, wTexel.zw);\n               }\n             `;\n         }\n       }\n     }\n     mainLoop += `\n     }\n   `;\n   mainLoop += `\n     }\n   `;\n   mainLoop += `\n     }\n   `;\n\n     let activationSnippet = '', applyActivationSnippet = '';\n     if (activation) {\n       if (hasPreluActivation) {\n         activationSnippet = `vec4 activation(vec4 a) {\n           vec4 b = getPreluActivationWeightsAtOutCoords();\n           ${activation}\n         }`;\n       } else if (hasLeakyReluAlpha) {\n         activationSnippet = `vec4 activation(vec4 a) {\n           vec4 b = getLeakyreluAlphaAtOutCoords();\n           ${activation}\n         }`;\n       } else {\n         activationSnippet = `vec4 activation(vec4 x) {\n           ${activation}\n         }`;\n       }\n\n       applyActivationSnippet = `result = activation(result);`;\n     }\n\n     const addBiasSnippet = addBias ? 
'result += getBiasAtOutCoords();' : '';\n     if (addBias) {\n       this.variableNames.push('bias');\n     }\n\n     if (hasPreluActivation) {\n       this.variableNames.push('preluActivationWeights');\n     }\n     if (hasLeakyReluAlpha) {\n       this.variableNames.push('leakyreluAlpha');\n     }\n\n     this.userCode = `\n       ${activationSnippet}\n\n       void main() {\n         ivec4 coords = getOutputCoords();\n         int batch = coords.x;\n         ivec2 xRCCorner = coords.yz * strides - pads;\n         int d2 = coords.w;\n         int xRCorner = xRCCorner.x;\n         int xCCorner = xRCCorner.y;\n\n         //intialize dotProd with a small epsilon seems to reduce GPU accuracy loss.\n         vec4 dotProd = vec4(0.000000000000001);\n\n         ${mainLoop}\n\n         vec4 result = dotProd - vec4(0.000000000000001);\n         ${addBiasSnippet}\n         ${applyActivationSnippet}\n         setOutput(result);\n       }\n     `;\n   }\n }\n"]}
|