/**
 * @license
 * Copyright 2018 Google LLC. All Rights Reserved.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * =============================================================================
 */
import { util } from '@tensorflow/tfjs-core';
|
import { useShapeUniforms } from './gpgpu_math';
|
/**
 * Builds the GLSL source (`userCode`) for a depthwise 2D convolution that
 * reads packed inputs and writes a packed output (`packedInputs` and
 * `packedOutput` are both set to true).
 *
 * The constructor does all of the work: it unrolls the filter columns at
 * shader-generation time, emitting one specialised code path per column pair
 * depending on the parity of the left padding, the stride width and the
 * dilation width. Only the filter rows are iterated by a loop inside the
 * generated shader.
 *
 * Instance fields written by the constructor:
 * - `variableNames`: shader inputs, `['x', 'W']` plus the optional `'bias'`,
 *   `'preluActivationWeights'` and `'leakyreluAlpha'` (in that order).
 * - `customUniforms`: the `pads`, `strides`, `dilations` and `inDims` ivec2
 *   uniforms referenced by the generated code.
 * - `outputShape`: taken verbatim from `convInfo.outShape`.
 * - `enableShapeUniforms`: result of `useShapeUniforms(outputShape.length)`.
 * - `userCode`: the generated shader body.
 */
export class DepthwiseConvPacked2DProgram {
    /**
     * @param convInfo Convolution description. The fields read here are
     *     `outShape`, `outChannels`, `inChannels`, `padInfo.left`,
     *     `strideWidth`, `dilationWidth`, `filterHeight` and `filterWidth`.
     * @param addBias When true, a `bias` input is registered and added to the
     *     result before the activation is applied.
     * @param activation GLSL statements forming the body of a
     *     `vec4 activation(...)` function, or a falsy value for no activation.
     * @param hasPreluActivation When true, a `preluActivationWeights` input is
     *     registered; if `activation` is also set, the activation body can
     *     read those weights as `b` and the pre-activation value as `a`.
     * @param hasLeakyReluAlpha When true, a `leakyreluAlpha` input is
     *     registered; if `activation` is also set (and `hasPreluActivation` is
     *     false), the activation body can read the alpha as `b` and the
     *     pre-activation value as `a`.
     */
    constructor(convInfo, addBias = false, activation = null, hasPreluActivation = false, hasLeakyReluAlpha = false) {
        this.variableNames = ['x', 'W'];
        this.packedInputs = true;
        this.packedOutput = true;
        this.customUniforms = [
            { name: 'pads', type: 'ivec2' },
            { name: 'strides', type: 'ivec2' },
            { name: 'dilations', type: 'ivec2' },
            { name: 'inDims', type: 'ivec2' },
        ];
        this.outputShape = convInfo.outShape;
        this.enableShapeUniforms = useShapeUniforms(this.outputShape.length);
        // Number of output channels produced per input channel. It is
        // interpolated into integer GLSL expressions below, so it is presumably
        // always a whole number -- TODO confirm against callers.
        const channelMul = convInfo.outChannels / convInfo.inChannels;
        const padLeft = convInfo.padInfo.left;
        const strideWidth = convInfo.strideWidth;
        const dilationWidth = convInfo.dilationWidth;
        const filterHeight = convInfo.filterHeight;
        const filterWidth = convInfo.filterWidth;
        const texelsAcross = filterWidth;
        // Scratch variables shared by every unrolled column.
        let mainLoop = `
      int xR; int xC; int xCOffset;
      vec4 wTexel; vec4 previous; vec4 final;`;
        // Declare, per filter column, two texel caches (each with a "Ready"
        // flag so a texel is fetched at most once per row) and the composed
        // vec4 that is multiplied with the weights.
        for (let c = 0; c < filterWidth; c++) {
            mainLoop += `
          vec4 xTexelC${c * 2};
          int xTexelC${c * 2}Ready;
          vec4 xTexelC${c * 2 + 1};
          int xTexelC${c * 2 + 1}Ready;
          vec4 xC${c};`;
        }
        /**
         * This vectorized implementation works by gathering the values needed for
         * each output channel's dot product into vec4's and then multiplying them
         * all together (this happens in the final double for-loop below). Most of
         * the main loop consists of constructing these vec4's with the minimum
         * number of texture2D calls, which means making use of all four returned
         * values from a texture2D call at once.
         */
        mainLoop += `
    for (int r = 0; r < ${filterHeight}; r++) {
      `;
        // Reset the caches at the top of every filter row.
        for (let c = 0; c < filterWidth; c++) {
            mainLoop += `
          xTexelC${c * 2} = vec4(0.0);
          xTexelC${c * 2}Ready = 0;
          xTexelC${c * 2 + 1} = vec4(0.0);
          xTexelC${c * 2 + 1}Ready = 0;
          xC${c} = vec4(0.0);`;
        }
        mainLoop += `
        xR = xRCorner + r * dilations[0];
        if (xR >=0 && xR < inDims[0]) {
      `;
        // Filter columns are processed two at a time: colIndex and colIndex + 1.
        for (let texelC = 0; texelC < (texelsAcross + 1) / 2; texelC++) {
            const colIndex = texelC * 2;
            mainLoop += `
          xC = xCCorner + ${colIndex * dilationWidth};
          `;
            if (strideWidth === 1) {
                if (colIndex < filterWidth) {
                    // If padding is odd, the outer texels have to be composed.
                    if (padLeft % 2 === 1) {
                        // TODO: Ensure vec4 previous does not result in redundant sample,
                        // and avoid setting xTexelRC's that exceed the boundary in the
                        // first place rather than resetting them to vec4(0)).
                        // To compute xCOffset:
                        // - If padding is odd, we must add 1 to ensure we ask for an
                        // even-numbered row.
                        // - We subtract 2 to access the previous texel.
                        mainLoop += `
                xCOffset = xC + 1;
                if (xCOffset >= 0 && xCOffset < inDims[1] && xTexelC${colIndex}Ready == 0) {
                  xTexelC${colIndex} = getX(batch, xR, xCOffset, d1);

                  // Need to manually clear unused channels in case
                  // we're reading from recycled texture.
                  if (xCOffset + 1 >= inDims[1]) {
                    xTexelC${colIndex}.zw = vec2(0.0);
                  }
                  xTexelC${colIndex}Ready = 1;
                }
              `;
                        // This texel has been read in previous iteration if the dilation
                        // is 1.
                        if (dilationWidth === 1 && colIndex > 0) {
                            mainLoop += `
                xC${colIndex} = vec4(xTexelC${colIndex - 2}.zw, xTexelC${colIndex}.xy);
                `;
                        }
                        else {
                            mainLoop += `
                  xCOffset = xC + 1 - 2;

                  if (xCOffset >= 0 && xCOffset < inDims[1]) {
                    previous = getX(batch, xR, xCOffset, d1);

                    // Need to manually clear unused channels in case
                    // we're reading from recycled texture.
                    if (xCOffset + 1 >= inDims[1]) {
                      previous.zw = vec2(0.0);
                    }

                    xC${colIndex} = vec4(previous.zw, xTexelC${colIndex}.xy);
                  } else {
                    xC${colIndex} = vec4(0.0, 0.0, xTexelC${colIndex}.xy);
                  }
                  `;
                        }
                    }
                    else {
                        // Padding is even, so xRC corresponds to a single texel.
                        mainLoop += `
                if (xC >= 0 && xC < inDims[1] && xTexelC${colIndex}Ready == 0) {
                  xTexelC${colIndex} = getX(batch, xR, xC, d1);
                  if (xC + 1 >= inDims[1]) {
                    xTexelC${colIndex}.zw = vec2(0.0);
                  }
                  xTexelC${colIndex}Ready = 1;
                }

                xC${colIndex} = xTexelC${colIndex};
                `;
                    }
                    if (colIndex + 1 < filterWidth) {
                        // If dilation is even, the second entry should match the first
                        // (either both are composed or both are single samples). But if
                        // dilation is odd, then the second entry should be the opposite
                        // of the first (if the first is composed, the second is a single
                        // sample, and vice versa.)
                        const nextTexelOffset = padLeft % 2 === 0 ?
                            util.nearestLargerEven(dilationWidth) :
                            dilationWidth;
                        if ((dilationWidth % 2 === 0 && padLeft % 2 === 1) ||
                            (dilationWidth % 2 !== 0 && padLeft % 2 !== 1)) {
                            mainLoop += `
                  xCOffset = xC + imod(pads[1], 2) + ${nextTexelOffset};

                  if (xCOffset >= 0 && xCOffset < inDims[1] && xTexelC${colIndex + 1}Ready == 0) {
                    xTexelC${colIndex + 1} = getX(batch, xR, xCOffset, d1);

                    // Need to manually clear unused channels in case
                    // we're reading from recycled texture.
                    if (xCOffset + 1 >= inDims[1]) {
                      xTexelC${colIndex + 1}.zw = vec2(0.0);
                    }
                    xTexelC${colIndex + 1}Ready = 1;
                  }
                  `;
                            // If dilation > 1 then the xRC's will not be able to share any
                            // values, so each xRC will require two unique calls to getX.
                            if (dilationWidth > 1) {
                                // The .zw half of `previous` is cleared when it lies past
                                // the last input column, exactly as the other `previous`
                                // fetch above does; otherwise stale texture contents could
                                // be accumulated into the dot product.
                                mainLoop += `
                    xCOffset -= 2;
                    if (xCOffset >= 0 && xCOffset < inDims[1]) {
                     previous = getX(batch, xR, xCOffset, d1);

                     // Need to manually clear unused channels in case
                     // we're reading from recycled texture.
                     if (xCOffset + 1 >= inDims[1]) {
                       previous.zw = vec2(0.0);
                     }

                     xC${colIndex + 1} = vec4(previous.zw, xTexelC${colIndex + 1}.xy);
                    } else {
                     xC${colIndex + 1} = vec4(0.0, 0.0, xTexelC${colIndex + 1}.xy);
                    }
                    `;
                            }
                            else {
                                mainLoop += `
                    xC${colIndex + 1} = vec4(xTexelC${colIndex}.zw, xTexelC${colIndex + 1}.xy);
                    `;
                            }
                        }
                        else {
                            // If dilation is 1 and padding is odd, we have already read the
                            // texel when constructing the previous x value. Here we can
                            // simply skip the texture read.
                            if (nextTexelOffset === 1) {
                                mainLoop += `
                    xC${colIndex + 1} = xTexelC${colIndex};
                    `;
                            }
                            else {
                                mainLoop += `
                    xCOffset = xC + ${nextTexelOffset};

                    if (xCOffset >= 0 && xCOffset < inDims[1] && xTexelC${colIndex + 1}Ready == 0) {
                      xTexelC${colIndex + 1} = getX(batch, xR, xCOffset, d1);
                      if (xCOffset + 1 >= inDims[1]) {
                        xTexelC${colIndex + 1}.zw = vec2(0.0);
                      }
                      xTexelC${colIndex + 1}Ready = 1;
                    }

                    xC${colIndex + 1} = xTexelC${colIndex + 1};
                    `;
                            }
                        }
                    }
                }
            }
            else {
                // Any stride width other than 1 takes this path; the generated code
                // is written for a stride of 2 -- presumably callers only select
                // this program for strides 1 and 2 (confirm at the call site).
                if (colIndex < filterWidth) {
                    // Depending on whether padLeft is even or odd, we want either the
                    // xy or zw channels from X texels for xC${colIndex}. If padLeft is
                    // even, xC${colIndex + 1} is simply the zw channels of texels we've
                    // already sampled. But if padLeft is odd, xC${colIndex + 1}.zw will
                    // need to come from the xy channels of a new texel, hence the
                    // `vec4 final` initialized below.
                    if (padLeft % 2 === 1) {
                        mainLoop += `
                xCOffset = xC + 1 - strides[1];
                if(xCOffset >= 0 && xCOffset < inDims[1] && xTexelC${colIndex}Ready == 0) {
                  xTexelC${colIndex} = getX(batch, xR, xCOffset, d1);
                  // Need to manually clear unused channels in case
                  // we're reading from recycled texture.
                  if (xCOffset + 1 >= inDims[1]) {
                    xTexelC${colIndex}.zw = vec2(0.0);
                  }
                  xTexelC${colIndex}Ready = 1;
                }

                if(xC + 1 >= 0 && xC + 1 < inDims[1] && xTexelC${colIndex + 1}Ready == 0) {
                  xTexelC${colIndex + 1} = getX(batch, xR, xC + 1, d1);
                  // Need to manually clear unused channels in case
                  // we're reading from recycled texture.
                  if (xC + 2 >= inDims[1]) {
                    xTexelC${colIndex + 1}.zw = vec2(0.0);
                  }
                  xTexelC${colIndex + 1}Ready = 1;
                }

                xC${colIndex} = vec4(xTexelC${colIndex}.zw, xTexelC${colIndex + 1}.zw);
              `;
                        if (colIndex + 1 < filterWidth) {
                            // Only final.xy is consumed, so no .zw clearing is needed.
                            mainLoop += `
                  final = vec4(0.0);
                  xCOffset = xC + 1 + strides[1];
                  if(xCOffset >= 0 && xCOffset < inDims[1]) {
                    final = getX(batch, xR, xCOffset, d1);
                  }
                  xC${colIndex + 1} = vec4(xTexelC${colIndex + 1}.xy, final.xy);
                `;
                        }
                    }
                    else {
                        mainLoop += `
                if(xC >= 0 && xC < inDims[1] && xTexelC${colIndex}Ready == 0) {
                  xTexelC${colIndex} = getX(batch, xR, xC, d1);
                  if (xC + 1 >= inDims[1]) {
                    xTexelC${colIndex}.zw = vec2(0.0);
                  }
                  xTexelC${colIndex}Ready = 1;
                }

                xCOffset = xC + strides[1];
                if(xCOffset >= 0 && xCOffset < inDims[1] && xTexelC${colIndex + 1}Ready == 0) {
                  xTexelC${colIndex + 1} = getX(batch, xR, xCOffset, d1);
                  if (xCOffset + 1 >= inDims[1]) {
                    xTexelC${colIndex + 1}.zw = vec2(0.);
                  }
                  xTexelC${colIndex + 1}Ready = 1;
                }

                xC${colIndex} = vec4(
                  xTexelC${colIndex}.xy, xTexelC${colIndex + 1}.xy);
              `;
                        if (colIndex + 1 < filterWidth) {
                            mainLoop += `
                  xC${colIndex + 1} = vec4(xTexelC${colIndex}.zw, xTexelC${colIndex + 1}.zw);
                `;
                        }
                    }
                }
            }
            // Localize the dotProd accumulation within the loop. The theory is
            // that on GPUs with limited cache, accumulating the sum across a large
            // number of variables causes lots of cache misses (e.g. a 5x5 filter
            // would have 50 variables).
            if (colIndex < filterWidth) {
                mainLoop += `
            wTexel = getW(r, ${colIndex}, d1, q);
            dotProd += xC${colIndex} * vec4(wTexel.xz, wTexel.xz);
          `;
                if (colIndex + 1 < filterWidth) {
                    mainLoop += `
              wTexel = getW(r, ${colIndex + 1}, d1, q);
              dotProd += xC${colIndex + 1} * vec4(wTexel.xz, wTexel.xz);
            `;
                }
            }
        }
        // Close the `if (xR ...)` row-bounds check ...
        mainLoop += `
    }
  `;
        // ... and the `for (int r ...)` filter-row loop.
        mainLoop += `
      }
    `;
        let activationSnippet = '', applyActivationSnippet = '';
        if (activation) {
            if (hasPreluActivation) {
                activationSnippet = `vec4 activation(vec4 a) {
          vec4 b = getPreluActivationWeightsAtOutCoords();
          ${activation}
        }`;
            }
            else if (hasLeakyReluAlpha) {
                activationSnippet = `vec4 activation(vec4 a) {
          vec4 b = getLeakyreluAlphaAtOutCoords();
          ${activation}
        }`;
            }
            else {
                activationSnippet = `vec4 activation(vec4 x) {
          ${activation}
        }`;
            }
            applyActivationSnippet = `result = activation(result);`;
        }
        const addBiasSnippet = addBias ? 'result += getBiasAtOutCoords();' : '';
        // The extra inputs are registered in a fixed order: bias, PReLU
        // weights, leaky-ReLU alpha.
        if (addBias) {
            this.variableNames.push('bias');
        }
        if (hasPreluActivation) {
            this.variableNames.push('preluActivationWeights');
        }
        if (hasLeakyReluAlpha) {
            this.variableNames.push('leakyreluAlpha');
        }
        this.userCode = `
      ${activationSnippet}

      void main() {
        ivec4 coords = getOutputCoords();
        int batch = coords.x;
        ivec2 xRCCorner = coords.yz * strides - pads;
        int d2 = coords.w;
        int d1 = d2 / ${channelMul};
        int q = d2 - d1 * ${channelMul};
        int xRCorner = xRCCorner.x;
        int xCCorner = xRCCorner.y;

        //intialize dotProd with a small epsilon seems to reduce GPU accuracy loss.
        vec4 dotProd = vec4(0.000000000000001);

        ${mainLoop}

        vec4 result = dotProd - vec4(0.000000000000001);
        ${addBiasSnippet}
        ${applyActivationSnippet}
        setOutput(result);
      }
    `;
    }
}
|
//# sourceMappingURL=data:application/json;base64,{"version":3,"file":"conv_packed_gpu_depthwise.js","sourceRoot":"","sources":["../../../../../tfjs-backend-webgl/src/conv_packed_gpu_depthwise.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;GAeG;AAEH,OAAO,EAAe,IAAI,EAAC,MAAM,uBAAuB,CAAC;AAEzD,OAAO,EAAe,gBAAgB,EAAC,MAAM,cAAc,CAAC;AAE5D,MAAM,OAAO,4BAA4B;IAcvC,YACI,QAAiC,EAAE,OAAO,GAAG,KAAK,EAClD,aAAqB,IAAI,EAAE,kBAAkB,GAAG,KAAK,EACrD,iBAAiB,GAAG,KAAK;QAhB7B,kBAAa,GAAG,CAAC,GAAG,EAAE,GAAG,CAAC,CAAC;QAC3B,iBAAY,GAAG,IAAI,CAAC;QACpB,iBAAY,GAAG,IAAI,CAAC;QAIpB,mBAAc,GAAG;YACf,EAAC,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE,OAAgB,EAAE;YACvC,EAAC,IAAI,EAAE,SAAS,EAAE,IAAI,EAAE,OAAgB,EAAE;YAC1C,EAAC,IAAI,EAAE,WAAW,EAAE,IAAI,EAAE,OAAgB,EAAE;YAC5C,EAAC,IAAI,EAAE,QAAQ,EAAE,IAAI,EAAE,OAAgB,EAAE;SAC1C,CAAC;QAMA,IAAI,CAAC,WAAW,GAAG,QAAQ,CAAC,QAAQ,CAAC;QACrC,IAAI,CAAC,mBAAmB,GAAG,gBAAgB,CAAC,IAAI,CAAC,WAAW,CAAC,MAAM,CAAC,CAAC;QACrE,MAAM,UAAU,GAAG,QAAQ,CAAC,WAAW,GAAG,QAAQ,CAAC,UAAU,CAAC;QAC9D,MAAM,OAAO,GAAG,QAAQ,CAAC,OAAO,CAAC,IAAI,CAAC;QACtC,MAAM,WAAW,GAAG,QAAQ,CAAC,WAAW,CAAC;QACzC,MAAM,aAAa,GAAG,QAAQ,CAAC,aAAa,CAAC;QAC7C,MAAM,YAAY,GAAG,QAAQ,CAAC,YAAY,CAAC;QAC3C,MAAM,WAAW,GAAG,QAAQ,CAAC,WAAW,CAAC;QACzC,MAAM,YAAY,GAAG,WAAW,CAAC;QAEjC,IAAI,QAAQ,GAAG;;8CAE2B,CAAC;QAE3C,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,WAAW,EAAE,CAAC,EAAE,EAAE;YACpC,QAAQ,IAAI;wBACM,CAAC,GAAG,CAAC;uBACN,CAAC,GAAG,CAAC;wBACJ,CAAC,GAAG,CAAC,GAAG,CAAC;uBACV,CAAC,GAAG,CAAC,GAAG,CAAC;mBACb,CAAC,GAAG,CAAC;SACnB;QAED;;;;;;;WAOG;QACH,QAAQ,IAAI;0BACU,YAAY;OAC/B,CAAC;QACJ,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,WAAW,EAAE,CAAC,EAAE,EAAE;YACpC,QAAQ,IAAI;mBACC,CAAC,GAAG,CAAC;mBACL,CAAC,GAAG,CAAC;mBACL,CAAC,GAAG,CAAC,GAAG,CAAC;mBACT,CAAC,GAAG,CAAC,GAAG,CAAC;cACd,CAAC,eAAe,CAAC;SAC1B;QACD,QAAQ,IAAI;;;OAGT,CAAC;QAEJ,KAAK,IAAI,MAAM,GAAG,CAAC,EAAE,MAAM,GAAG,CAAC,YAAY,GAAG,CAAC,CAAC,GAAG,CAAC,EAAE,MAAM,EAAE,EAAE;YAC9D,MAAM,QAAQ,GAAG,MAAM,GAAG,CAAC,CAAC;YAE5B,QAAQ,IAAI;4BACU,QAAQ,GAAG,aAAa;WACzC,CAAC;YAEN,IAAI,WAAW,KAAK,CAAC,EAAE;gBACrB,IAAI,QAAQ,
GAAG,WAAW,EAAE;oBAC1B,2DAA2D;oBAC3D,IAAI,OAAO,GAAG,CAAC,KAAK,CAAC,EAAE;wBACrB,kEAAkE;wBAClE,+DAA+D;wBAC/D,sDAAsD;wBAEtD,uBAAuB;wBACvB,6DAA6D;wBAC7D,qBAAqB;wBACrB,gDAAgD;wBAEhD,QAAQ,IAAI;;sEAGR,QAAQ;2BACG,QAAQ;;;;;6BAKN,QAAQ;;2BAEV,QAAQ;;eAEpB,CAAC;wBACJ,iEAAiE;wBACjE,QAAQ;wBACR,IAAI,aAAa,KAAK,CAAC,IAAI,QAAQ,GAAG,CAAC,EAAE;4BACvC,QAAQ,IAAI;oBACN,QAAQ,kBAAkB,QAAQ,GAAG,CAAC,eACxC,QAAQ;iBACT,CAAC;yBACL;6BAAM;4BACL,QAAQ,IAAI;;;;;;;;;;;;wBAYF,QAAQ,+BAA+B,QAAQ;;wBAE/C,QAAQ,4BAA4B,QAAQ;;mBAEjD,CAAC;yBACP;qBACF;yBAAM;wBACL,yDAAyD;wBACzD,QAAQ,IAAI;0DACkC,QAAQ;2BACvC,QAAQ;;6BAEN,QAAQ;;2BAEV,QAAQ;;;oBAGf,QAAQ,aAAa,QAAQ;iBAChC,CAAC;qBACP;oBAED,IAAI,QAAQ,GAAG,CAAC,GAAG,WAAW,EAAE;wBAC9B,+DAA+D;wBAC/D,gEAAgE;wBAChE,gEAAgE;wBAChE,iEAAiE;wBACjE,2BAA2B;wBAE3B,MAAM,eAAe,GAAG,OAAO,GAAG,CAAC,KAAK,CAAC,CAAC,CAAC;4BACvC,IAAI,CAAC,iBAAiB,CAAC,aAAa,CAAC,CAAC,CAAC;4BACvC,aAAa,CAAC;wBAElB,IAAI,CAAC,aAAa,GAAG,CAAC,KAAK,CAAC,IAAI,OAAO,GAAG,CAAC,KAAK,CAAC,CAAC;4BAC9C,CAAC,aAAa,GAAG,CAAC,KAAK,CAAC,IAAI,OAAO,GAAG,CAAC,KAAK,CAAC,CAAC,EAAE;4BAClD,QAAQ,IAAI;uDAC6B,eAAe;;wEAGpD,QAAQ,GAAG,CAAC;6BACD,QAAQ,GAAG,CAAC;;;;;+BAKV,QAAQ,GAAG,CAAC;;6BAEd,QAAQ,GAAG,CAAC;;mBAEtB,CAAC;4BAEN,+DAA+D;4BAC/D,6DAA6D;4BAC7D,IAAI,aAAa,GAAG,CAAC,EAAE;gCACrB,QAAQ,IAAI;;;;yBAIH,QAAQ,GAAG,CAAC,+BACd,QAAQ,GAAG,CAAC;;yBAEV,QAAQ,GAAG,CAAC,4BACd,QAAQ,GAAG,CAAC;;qBAEd,CAAC;6BACP;iCAAM;gCACL,QAAQ,IAAI;wBACJ,QAAQ,GAAG,CAAC,kBAAkB,QAAQ,eAC1C,QAAQ,GAAG,CAAC;qBACX,CAAC;6BACP;yBACF;6BAAM;4BACL,gEAAgE;4BAChE,4DAA4D;4BAC5D,gCAAgC;4BAChC,IAAI,eAAe,KAAK,CAAC,EAAE;gCACzB,QAAQ,IAAI;wBACJ,QAAQ,GAAG,CAAC,aAAa,QAAQ;qBACpC,CAAC;6BACP;iCAAM;gCACL,QAAQ,IAAI;sCACU,eAAe;;0EAGjC,QAAQ,GAAG,CAAC;+BACD,QAAQ,GAAG,CAAC;;iCAEV,QAAQ,GAAG,CAAC;;+BAEd,QAAQ,GAAG,CAAC;;;wBAGnB,QAAQ,GAAG,CAAC,aAAa,QAAQ,GAAG,CAAC;qBACxC,CAAC;6BACP;yBACF;qBACF;iBACF;aACF;iBAAM,EAAG,eAAe;gBACvB,IAAI,QAAQ,GAAG,WAAW,EAAE;oBAC1B,kEAAkE;oBAClE,mEAAmE;oBACnE,mEAAmE;oBACnE,6DAA6D;oBAC7D,gEAAgE;oBAChE,OAAO;oBACP,4BAA4B;oBAC5B,IAAI,OAAO,GAAG,CAAC,KAAK,CAAC,EAAE;wBACrB,QAAQ,IAAI;;qEAGR,QAAQ;
2BACG,QAAQ;;;;6BAIN,QAAQ;;2BAEV,QAAQ;;;iEAInB,QAAQ,GAAG,CAAC;2BACD,QAAQ,GAAG,CAAC;;;;6BAIV,QAAQ,GAAG,CAAC;;2BAEd,QAAQ,GAAG,CAAC;;;oBAGnB,QAAQ,kBAAkB,QAAQ,eACtC,QAAQ,GAAG,CAAC;eACb,CAAC;wBAEJ,IAAI,QAAQ,GAAG,CAAC,GAAG,WAAW,EAAE;4BAC9B,QAAQ,IAAI;;;;;;sBAMJ,QAAQ,GAAG,CAAC,kBAAkB,QAAQ,GAAG,CAAC;iBAC/C,CAAC;yBACL;qBACF;yBAAM;wBACL,QAAQ,IAAI;yDACiC,QAAQ;2BACtC,QAAQ;;6BAEN,QAAQ;;2BAEV,QAAQ;;;;qEAKnB,QAAQ,GAAG,CAAC;2BACD,QAAQ,GAAG,CAAC;;6BAEV,QAAQ,GAAG,CAAC;;2BAEd,QAAQ,GAAG,CAAC;;;oBAGnB,QAAQ;2BACD,QAAQ,eAAe,QAAQ,GAAG,CAAC;eAC/C,CAAC;wBAEJ,IAAI,QAAQ,GAAG,CAAC,GAAG,WAAW,EAAE;4BAC9B,QAAQ,IAAI;sBACJ,QAAQ,GAAG,CAAC,kBAAkB,QAAQ,eAC1C,QAAQ,GAAG,CAAC;iBACb,CAAC;yBACL;qBACF;iBACF;aACF;YAED,uEAAuE;YACvE,gEAAgE;YAChE,wEAAwE;YACxE,gBAAgB;YAChB,IAAI,QAAQ,GAAG,WAAW,EAAE;gBAC1B,QAAQ,IAAI;+BACW,QAAQ;2BACZ,QAAQ;WACxB,CAAC;gBAEJ,IAAI,QAAQ,GAAG,CAAC,GAAG,WAAW,EAAE;oBAC9B,QAAQ,IAAI;iCACW,QAAQ,GAAG,CAAC;6BAChB,QAAQ,GAAG,CAAC;aAC5B,CAAC;iBACL;aACF;SACF;QACD,QAAQ,IAAI;;GAEb,CAAC;QACA,QAAQ,IAAI;;KAEX,CAAC;QAEF,IAAI,iBAAiB,GAAG,EAAE,EAAE,sBAAsB,GAAG,EAAE,CAAC;QACxD,IAAI,UAAU,EAAE;YACd,IAAI,kBAAkB,EAAE;gBACtB,iBAAiB,GAAG;;YAEhB,UAAU;UACZ,CAAC;aACJ;iBAAM,IAAI,iBAAiB,EAAE;gBAC5B,iBAAiB,GAAG;;YAEhB,UAAU;UACZ,CAAC;aACJ;iBAAM;gBACL,iBAAiB,GAAG;YAChB,UAAU;UACZ,CAAC;aACJ;YAED,sBAAsB,GAAG,8BAA8B,CAAC;SACzD;QAED,MAAM,cAAc,GAAG,OAAO,CAAC,CAAC,CAAC,iCAAiC,CAAC,CAAC,CAAC,EAAE,CAAC;QACxE,IAAI,OAAO,EAAE;YACX,IAAI,CAAC,aAAa,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;SACjC;QAED,IAAI,kBAAkB,EAAE;YACtB,IAAI,CAAC,aAAa,CAAC,IAAI,CAAC,wBAAwB,CAAC,CAAC;SACnD;QACD,IAAI,iBAAiB,EAAE;YACrB,IAAI,CAAC,aAAa,CAAC,IAAI,CAAC,gBAAgB,CAAC,CAAC;SAC3C;QAED,IAAI,CAAC,QAAQ,GAAG;QACZ,iBAAiB;;;;;;;wBAOD,UAAU;4BACN,UAAU;;;;;;;UAO5B,QAAQ;;;UAGR,cAAc;UACd,sBAAsB;;;KAG3B,CAAC;IACJ,CAAC;CACF","sourcesContent":["/**\n * @license\n * Copyright 2018 Google LLC. 
All Rights Reserved.\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may not use this file except in compliance with the License.\n * You may obtain a copy of the License at\n *\n * http://www.apache.org/licenses/LICENSE-2.0\n *\n * Unless required by applicable law or agreed to in writing, software\n * distributed under the License is distributed on an \"AS IS\" BASIS,\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n * See the License for the specific language governing permissions and\n * limitations under the License.\n * =============================================================================\n */\n\nimport {backend_util, util} from '@tensorflow/tfjs-core';\n\nimport {GPGPUProgram, useShapeUniforms} from './gpgpu_math';\n\nexport class DepthwiseConvPacked2DProgram implements GPGPUProgram {\n  variableNames = ['x', 'W'];\n  packedInputs = true;\n  packedOutput = true;\n  outputShape: number[];\n  userCode: string;\n  enableShapeUniforms: boolean;\n  customUniforms = [\n    {name: 'pads', type: 'ivec2' as const },\n    {name: 'strides', type: 'ivec2' as const },\n    {name: 'dilations', type: 'ivec2' as const },\n    {name: 'inDims', type: 'ivec2' as const },\n  ];\n\n  constructor(\n      convInfo: backend_util.Conv2DInfo, addBias = false,\n      activation: string = null, hasPreluActivation = false,\n      hasLeakyReluAlpha = false) {\n    this.outputShape = convInfo.outShape;\n    this.enableShapeUniforms = useShapeUniforms(this.outputShape.length);\n    const channelMul = convInfo.outChannels / convInfo.inChannels;\n    const padLeft = convInfo.padInfo.left;\n    const strideWidth = convInfo.strideWidth;\n    const dilationWidth = convInfo.dilationWidth;\n    const filterHeight = convInfo.filterHeight;\n    const filterWidth = convInfo.filterWidth;\n    const texelsAcross = filterWidth;\n\n    let mainLoop = `\n      int xR; int xC; int xCOffset;\n      vec4 wTexel; vec4 previous; vec4 
final;`;\n\n    for (let c = 0; c < filterWidth; c++) {\n      mainLoop += `\n          vec4 xTexelC${c * 2};\n          int xTexelC${c * 2}Ready;\n          vec4 xTexelC${c * 2 + 1};\n          int xTexelC${c * 2 + 1}Ready;\n          vec4 xC${c};`;\n    }\n\n    /**\n     * This vectorized implementation works by gathering the values needed for\n     * each output channel's dot product into vec4's and then multiplying them\n     * all together (this happens in the final double for-loop below). Most of\n     * the main loop consists of constructing these vec4's with the minimum\n     * number of texture2D calls, which means making use of all four returned\n     * values from a texture2D call at once.\n     */\n    mainLoop += `\n    for (int r = 0; r < ${filterHeight}; r++) {\n      `;\n    for (let c = 0; c < filterWidth; c++) {\n      mainLoop += `\n          xTexelC${c * 2} = vec4(0.0);\n          xTexelC${c * 2}Ready = 0;\n          xTexelC${c * 2 + 1} = vec4(0.0);\n          xTexelC${c * 2 + 1}Ready = 0;\n          xC${c} = vec4(0.0);`;\n    }\n    mainLoop += `\n        xR = xRCorner + r * dilations[0];\n        if (xR >=0 && xR < inDims[0]) {\n      `;\n\n    for (let texelC = 0; texelC < (texelsAcross + 1) / 2; texelC++) {\n      const colIndex = texelC * 2;\n\n      mainLoop += `\n          xC = xCCorner + ${colIndex * dilationWidth};\n          `;\n\n      if (strideWidth === 1) {\n        if (colIndex < filterWidth) {\n          // If padding is odd, the outer texels have to be composed.\n          if (padLeft % 2 === 1) {\n            // TODO: Ensure vec4 previous does not result in redundant sample,\n            // and avoid setting xTexelRC's that exceed the boundary in the\n            // first place rather than resetting them to vec4(0)).\n\n            // To compute xCOffset:\n            // - If padding is odd, we must add 1 to ensure we ask for an\n            // even-numbered row.\n            // - We subtract 2 to access the previous 
texel.\n\n            mainLoop += `\n                xCOffset = xC + 1;\n                if (xCOffset >= 0 && xCOffset < inDims[1] && xTexelC${\n                colIndex}Ready == 0) {\n                  xTexelC${colIndex} = getX(batch, xR, xCOffset, d1);\n\n                  // Need to manually clear unused channels in case\n                  // we're reading from recycled texture.\n                  if (xCOffset + 1 >= inDims[1]) {\n                    xTexelC${colIndex}.zw = vec2(0.0);\n                  }\n                  xTexelC${colIndex}Ready = 1;\n                }\n              `;\n            // This texel has been read in previous iteration if the dilation\n            // is 1.\n            if (dilationWidth === 1 && colIndex > 0) {\n              mainLoop += `\n                xC${colIndex} = vec4(xTexelC${colIndex - 2}.zw, xTexelC${\n                  colIndex}.xy);\n                `;\n            } else {\n              mainLoop += `\n                  xCOffset = xC + 1 - 2;\n\n                  if (xCOffset >= 0 && xCOffset < inDims[1]) {\n                    previous = getX(batch, xR, xCOffset, d1);\n\n                    // Need to manually clear unused channels in case\n                    // we're reading from recycled texture.\n                    if (xCOffset + 1 >= inDims[1]) {\n                      previous.zw = vec2(0.0);\n                    }\n\n                    xC${colIndex} = vec4(previous.zw, xTexelC${colIndex}.xy);\n                  } else {\n                    xC${colIndex} = vec4(0.0, 0.0, xTexelC${colIndex}.xy);\n                  }\n                  `;\n            }\n          } else {\n            // Padding is even, so xRC corresponds to a single texel.\n            mainLoop += `\n                if (xC >= 0 && xC < inDims[1] && xTexelC${colIndex}Ready == 0) {\n                  xTexelC${colIndex} = getX(batch, xR, xC, d1);\n                  if (xC + 1 >= inDims[1]) {\n                    xTexelC${colIndex}.zw = 
vec2(0.0);\n                  }\n                  xTexelC${colIndex}Ready = 1;\n                }\n\n                xC${colIndex} = xTexelC${colIndex};\n                `;\n          }\n\n          if (colIndex + 1 < filterWidth) {\n            // If dilation is even, the second entry should match the first\n            // (either both are composed or both are single samples). But if\n            // dilation is odd, then the second entry should be the opposite\n            // of the first (if the first is composed, the second is a single\n            // sample, and vice versa.)\n\n            const nextTexelOffset = padLeft % 2 === 0 ?\n                util.nearestLargerEven(dilationWidth) :\n                dilationWidth;\n\n            if ((dilationWidth % 2 === 0 && padLeft % 2 === 1) ||\n                (dilationWidth % 2 !== 0 && padLeft % 2 !== 1)) {\n              mainLoop += `\n                  xCOffset = xC + imod(pads[1], 2) + ${nextTexelOffset};\n\n                  if (xCOffset >= 0 && xCOffset < inDims[1] && xTexelC${\n                  colIndex + 1}Ready == 0) {\n                    xTexelC${colIndex + 1} = getX(batch, xR, xCOffset, d1);\n\n                    // Need to manually clear unused channels in case\n                    // we're reading from recycled texture.\n                    if (xCOffset + 1 >= inDims[1]) {\n                      xTexelC${colIndex + 1}.zw = vec2(0.0);\n                    }\n                    xTexelC${colIndex + 1}Ready = 1;\n                  }\n                  `;\n\n              // If dilation > 1 then the xRC's will not be able to share any\n              // values, so each xRC will require two unique calls to getX.\n              if (dilationWidth > 1) {\n                mainLoop += `\n                    xCOffset -= 2;\n                    if (xCOffset >= 0 && xCOffset < inDims[1]) {\n                     previous = getX(batch, xR, xCOffset, d1);\n                     xC${colIndex + 1} = vec4(previous.zw, 
xTexelC${\n                       colIndex + 1}.xy);\n                    } else {\n                     xC${colIndex + 1} = vec4(0.0, 0.0, xTexelC${\n                       colIndex + 1}.xy);\n                    }\n                    `;\n              } else {\n                mainLoop += `\n                    xC${colIndex + 1} = vec4(xTexelC${colIndex}.zw, xTexelC${\n                    colIndex + 1}.xy);\n                    `;\n              }\n            } else {\n              // If dilation is 1 and padding is odd, we have already read the\n              // texel when constructing the previous x value. Here we can\n              // simply skip the texture read.\n              if (nextTexelOffset === 1) {\n                mainLoop += `\n                    xC${colIndex + 1} = xTexelC${colIndex};\n                    `;\n              } else {\n                mainLoop += `\n                    xCOffset = xC + ${nextTexelOffset};\n\n                    if (xCOffset >= 0 && xCOffset < inDims[1] && xTexelC${\n                    colIndex + 1}Ready == 0) {\n                      xTexelC${colIndex + 1} = getX(batch, xR, xCOffset, d1);\n                      if (xCOffset + 1 >= inDims[1]) {\n                        xTexelC${colIndex + 1}.zw = vec2(0.0);\n                      }\n                      xTexelC${colIndex + 1}Ready = 1;\n                    }\n\n                    xC${colIndex + 1} = xTexelC${colIndex + 1};\n                    `;\n              }\n            }\n          }\n        }\n      } else {  // stride === 2\n        if (colIndex < filterWidth) {\n          // Depending on whether padLeft is even or odd, we want either the\n          // xy or zw channels from X texels for xC${colIndex}. If padLeft is\n          // even, xC${colIndex +1} is simply the zw channels of texels we've\n          // already sampled. 
But if padLeft is odd, xC{$c + 1}.zw will\n          // need to come from the xy channels of a new texel, hence the `\n          // vec4\n          // final` initialized below.\n          if (padLeft % 2 === 1) {\n            mainLoop += `\n                xCOffset = xC + 1 - strides[1];\n                if(xCOffset >= 0 && xCOffset < inDims[1] && xTexelC${\n                colIndex}Ready == 0) {\n                  xTexelC${colIndex} = getX(batch, xR, xCOffset, d1);\n                  // Need to manually clear unused channels in case\n                  // we're reading from recycled texture.\n                  if (xCOffset + 1 >= inDims[1]) {\n                    xTexelC${colIndex}.zw = vec2(0.0);\n                  }\n                  xTexelC${colIndex}Ready = 1;\n                }\n\n                if(xC + 1 >= 0 && xC + 1 < inDims[1] && xTexelC${\n                colIndex + 1}Ready == 0) {\n                  xTexelC${colIndex + 1} = getX(batch, xR, xC + 1, d1);\n                  // Need to manually clear unused channels in case\n                  // we're reading from recycled texture.\n                  if (xC + 2 >= inDims[1]) {\n                    xTexelC${colIndex + 1}.zw = vec2(0.0);\n                  }\n                  xTexelC${colIndex + 1}Ready = 1;\n                }\n\n                xC${colIndex} = vec4(xTexelC${colIndex}.zw, xTexelC${\n                colIndex + 1}.zw);\n              `;\n\n            if (colIndex + 1 < filterWidth) {\n              mainLoop += `\n                  final = vec4(0.0);\n                  xCOffset = xC + 1 + strides[1];\n                  if(xCOffset >= 0 && xCOffset < inDims[1]) {\n                    final = getX(batch, xR, xCOffset, d1);\n                  }\n                  xC${colIndex + 1} = vec4(xTexelC${colIndex + 1}.xy, final.xy);\n                `;\n            }\n          } else {\n            mainLoop += `\n                if(xC >= 0 && xC < inDims[1] && xTexelC${colIndex}Ready == 0) {\n         
         xTexelC${colIndex} = getX(batch, xR, xC, d1);\n                  if (xC + 1 >= inDims[1]) {\n                    xTexelC${colIndex}.zw = vec2(0.0);\n                  }\n                  xTexelC${colIndex}Ready = 1;\n                }\n\n                xCOffset = xC + strides[1];\n                if(xCOffset >= 0 && xCOffset < inDims[1] && xTexelC${\n                colIndex + 1}Ready == 0) {\n                  xTexelC${colIndex + 1} = getX(batch, xR, xCOffset, d1);\n                  if (xCOffset + 1 >= inDims[1]) {\n                    xTexelC${colIndex + 1}.zw = vec2(0.);\n                  }\n                  xTexelC${colIndex + 1}Ready = 1;\n                }\n\n                xC${colIndex} = vec4(\n                  xTexelC${colIndex}.xy, xTexelC${colIndex + 1}.xy);\n              `;\n\n            if (colIndex + 1 < filterWidth) {\n              mainLoop += `\n                  xC${colIndex + 1} = vec4(xTexelC${colIndex}.zw, xTexelC${\n                  colIndex + 1}.zw);\n                `;\n            }\n          }\n        }\n      }\n\n      // localize the dotProd accumulation within the loop, the theory is for\n      // GPU with limited cache, accumulate sum across large amount of\n      // veriables will cause lots of cache misses. (i.e. 
5x5 filter will have\n      // 50 variables)\n      if (colIndex < filterWidth) {\n        mainLoop += `\n            wTexel = getW(r, ${colIndex}, d1, q);\n            dotProd += xC${colIndex} * vec4(wTexel.xz, wTexel.xz);\n          `;\n\n        if (colIndex + 1 < filterWidth) {\n          mainLoop += `\n              wTexel = getW(r, ${colIndex + 1}, d1, q);\n              dotProd += xC${colIndex + 1} * vec4(wTexel.xz, wTexel.xz);\n            `;\n        }\n      }\n    }\n    mainLoop += `\n    }\n  `;\n    mainLoop += `\n      }\n    `;\n\n    let activationSnippet = '', applyActivationSnippet = '';\n    if (activation) {\n      if (hasPreluActivation) {\n        activationSnippet = `vec4 activation(vec4 a) {\n          vec4 b = getPreluActivationWeightsAtOutCoords();\n          ${activation}\n        }`;\n      } else if (hasLeakyReluAlpha) {\n        activationSnippet = `vec4 activation(vec4 a) {\n          vec4 b = getLeakyreluAlphaAtOutCoords();\n          ${activation}\n        }`;\n      } else {\n        activationSnippet = `vec4 activation(vec4 x) {\n          ${activation}\n        }`;\n      }\n\n      applyActivationSnippet = `result = activation(result);`;\n    }\n\n    const addBiasSnippet = addBias ? 
'result += getBiasAtOutCoords();' : '';\n    if (addBias) {\n      this.variableNames.push('bias');\n    }\n\n    if (hasPreluActivation) {\n      this.variableNames.push('preluActivationWeights');\n    }\n    if (hasLeakyReluAlpha) {\n      this.variableNames.push('leakyreluAlpha');\n    }\n\n    this.userCode = `\n      ${activationSnippet}\n\n      void main() {\n        ivec4 coords = getOutputCoords();\n        int batch = coords.x;\n        ivec2 xRCCorner = coords.yz * strides - pads;\n        int d2 = coords.w;\n        int d1 = d2 / ${channelMul};\n        int q = d2 - d1 * ${channelMul};\n        int xRCorner = xRCCorner.x;\n        int xCCorner = xRCCorner.y;\n\n        //intialize dotProd with a small epsilon seems to reduce GPU accuracy loss.\n        vec4 dotProd = vec4(0.000000000000001);\n\n        ${mainLoop}\n\n        vec4 result = dotProd - vec4(0.000000000000001);\n        ${addBiasSnippet}\n        ${applyActivationSnippet}\n        setOutput(result);\n      }\n    `;\n  }\n}\n"]}
|