/**
 * @license
 * Copyright 2022 Google LLC. All Rights Reserved.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * =============================================================================
 */
import { util } from '@tensorflow/tfjs-core';
import { useShapeUniforms } from './gpgpu_math';

/**
 * Shader program description for a packed 2D convolution.
 *
 * The constructor assembles GLSL source into `userCode`. The generated
 * `main()` accumulates `x * W` products into a `vec4 dotProd`, optionally adds
 * a bias and applies an activation, then calls `setOutput`. Besides `userCode`
 * the instance exposes `variableNames`, `packedInputs`, `packedOutput`,
 * `customUniforms`, `outputShape` and `enableShapeUniforms`.
 */
export class Conv2DPackedProgram {
  /**
   * @param convInfo Convolution description. This constructor reads
   *     `outShape`, `padInfo.left`, `strideWidth`, `dilationWidth`,
   *     `filterHeight`, `filterWidth` and `inChannels` from it.
   * @param addBias When true, a `bias` input is registered and
   *     `getBiasAtOutCoords()` is added to the result.
   * @param activation GLSL statements used as the body of the generated
   *     `vec4 activation(...)` function, or null/empty for no activation.
   * @param hasPreluActivation When true, a `preluActivationWeights` input is
   *     registered; if `activation` is also set, the activation body sees
   *     those weights as `b` and the incoming value as `a`.
   * @param hasLeakyReluAlpha When true, a `leakyreluAlpha` input is
   *     registered; if `activation` is also set (and `hasPreluActivation` is
   *     false), the activation body sees alpha as `b` and the value as `a`.
   */
  constructor(
      convInfo, addBias = false, activation = null, hasPreluActivation = false,
      hasLeakyReluAlpha = false) {
    this.variableNames = ['x', 'W'];
    this.packedInputs = true;
    this.packedOutput = true;
    this.customUniforms = [
      { name: 'pads', type: 'ivec2' },
      { name: 'strides', type: 'ivec2' },
      { name: 'dilations', type: 'ivec2' },
      { name: 'inDims', type: 'ivec2' },
    ];
    this.outputShape = convInfo.outShape;
    this.enableShapeUniforms = useShapeUniforms(this.outputShape.length);
    const padLeft = convInfo.padInfo.left;
    const strideWidth = convInfo.strideWidth;
    const dilationWidth = convInfo.dilationWidth;
    const filterHeight = convInfo.filterHeight;
    const filterWidth = convInfo.filterWidth;
    const texelsAcross = filterWidth;

    let mainLoop = `
       int xR; int xC; int xCOffset;
       vec4 wTexel; vec4 previous; vec4 final;`;

    // Declare the per-column scratch variables used by the main loop.
    for (let c = 0; c < filterWidth; c++) {
      mainLoop += `
           vec4 xTexelC${c * 2};
           int xTexelC${c * 2}Ready;
           vec4 xTexelC${c * 2 + 1};
           int xTexelC${c * 2 + 1}Ready;
           vec4 xC${c};`;
    }

    /**
     * This vectorized implementation works by gathering the values needed for
     * each output channel's dot product into vec4's and then multiplying them
     * all together (this happens in the final double for-loop below). Most of
     * the main loop consists of constructing these vec4's with the minimum
     * number of texture2D calls, which means making use of all four returned
     * values from a texture2D call at once.
     */
    mainLoop += `
     for (int r = 0; r < ${filterHeight}; r++) {
      for (int d1 = 0; d1 < ${convInfo.inChannels}; d1 += 2) {
       `;
    // Reset the scratch variables at the start of every (r, d1) iteration.
    for (let c = 0; c < filterWidth; c++) {
      mainLoop += `
           xTexelC${c * 2} = vec4(0.0);
           xTexelC${c * 2}Ready = 0;
           xTexelC${c * 2 + 1} = vec4(0.0);
           xTexelC${c * 2 + 1}Ready = 0;
           xC${c} = vec4(0.0);`;
    }
    mainLoop += `
         xR = xRCorner + r * dilations[0];
         if (xR >=0 && xR < inDims[0]) {
       `;

    // Filter columns are processed two at a time (colIndex, colIndex + 1).
    for (let texelC = 0; texelC < (texelsAcross + 1) / 2; texelC++) {
      const colIndex = texelC * 2;

      mainLoop += `
           xC = xCCorner + ${colIndex * dilationWidth};
           `;

      if (strideWidth === 1) {
        if (colIndex < filterWidth) {
          // If padding is odd, the outer texels have to be composed.
          if (padLeft % 2 === 1) {
            // TODO: Ensure vec4 previous does not result in redundant sample,
            // and avoid setting xTexelRC's that exceed the boundary in the
            // first place rather than resetting them to vec4(0)).

            // To compute xCOffset:
            // - If padding is odd, we must add 1 to ensure we ask for an
            // even-numbered row.
            // - We subtract 2 to access the previous texel.

            mainLoop += `
                 xCOffset = xC + 1;
                 if (xCOffset >= 0 && xCOffset < inDims[1] && xTexelC${colIndex}Ready == 0) {
                   xTexelC${colIndex} = getX(batch, xR, xCOffset, d1);

                   // Need to manually clear unused channels in case
                   // we're reading from recycled texture.
                   if (xCOffset + 1 >= inDims[1]) {
                     xTexelC${colIndex}.zw = vec2(0.0);
                   }
                   xTexelC${colIndex}Ready = 1;
                 }
               `;
            // This texel has been read in previous iteration if the dilation
            // is 1.
            if (dilationWidth === 1 && colIndex > 0) {
              mainLoop += `
                 xC${colIndex} = vec4(xTexelC${colIndex - 2}.zw, xTexelC${colIndex}.xy);
                 `;
            } else {
              mainLoop += `
                   xCOffset = xC + 1 - 2;

                   if (xCOffset >= 0 && xCOffset < inDims[1]) {
                     previous = getX(batch, xR, xCOffset, d1);

                     // Need to manually clear unused channels in case
                     // we're reading from recycled texture.
                     if (xCOffset + 1 >= inDims[1]) {
                       previous.zw = vec2(0.0);
                     }

                     xC${colIndex} = vec4(previous.zw, xTexelC${colIndex}.xy);
                   } else {
                     xC${colIndex} = vec4(0.0, 0.0, xTexelC${colIndex}.xy);
                   }
                   `;
            }
          } else {
            // Padding is even, so xRC corresponds to a single texel.
            mainLoop += `
                 if (xC >= 0 && xC < inDims[1] && xTexelC${colIndex}Ready == 0) {
                   xTexelC${colIndex} = getX(batch, xR, xC, d1);
                   if (xC + 1 >= inDims[1]) {
                     xTexelC${colIndex}.zw = vec2(0.0);
                   }
                   xTexelC${colIndex}Ready = 1;
                 }

                 xC${colIndex} = xTexelC${colIndex};
                 `;
          }

          if (colIndex + 1 < filterWidth) {
            // If dilation is even, the second entry should match the first
            // (either both are composed or both are single samples). But if
            // dilation is odd, then the second entry should be the opposite
            // of the first (if the first is composed, the second is a single
            // sample, and vice versa.)

            const nextTexelOffset = padLeft % 2 === 0 ?
                util.nearestLargerEven(dilationWidth) :
                dilationWidth;

            if ((dilationWidth % 2 === 0 && padLeft % 2 === 1) ||
                (dilationWidth % 2 !== 0 && padLeft % 2 !== 1)) {
              mainLoop += `
                   xCOffset = xC + imod(pads[1], 2) + ${nextTexelOffset};

                   if (xCOffset >= 0 && xCOffset < inDims[1] && xTexelC${colIndex + 1}Ready == 0) {
                     xTexelC${colIndex + 1} = getX(batch, xR, xCOffset, d1);

                     // Need to manually clear unused channels in case
                     // we're reading from recycled texture.
                     if (xCOffset + 1 >= inDims[1]) {
                       xTexelC${colIndex + 1}.zw = vec2(0.0);
                     }
                     xTexelC${colIndex + 1}Ready = 1;
                   }
                   `;

              // If dilation > 1 then the xRC's will not be able to share any
              // values, so each xRC will require two unique calls to getX.
              if (dilationWidth > 1) {
                // The .zw channels of `previous` are consumed below, so they
                // are cleared at the right edge exactly like every other
                // texel read in this shader whose .zw is used.
                mainLoop += `
                     xCOffset -= 2;
                     if (xCOffset >= 0 && xCOffset < inDims[1]) {
                      previous = getX(batch, xR, xCOffset, d1);

                      // Need to manually clear unused channels in case
                      // we're reading from recycled texture.
                      if (xCOffset + 1 >= inDims[1]) {
                        previous.zw = vec2(0.0);
                      }

                      xC${colIndex + 1} = vec4(previous.zw, xTexelC${colIndex + 1}.xy);
                     } else {
                      xC${colIndex + 1} = vec4(0.0, 0.0, xTexelC${colIndex + 1}.xy);
                     }
                     `;
              } else {
                mainLoop += `
                     xC${colIndex + 1} = vec4(xTexelC${colIndex}.zw, xTexelC${colIndex + 1}.xy);
                     `;
              }
            } else {
              // If dilation is 1 and padding is odd, we have already read the
              // texel when constructing the previous x value. Here we can
              // simply skip the texture read.
              if (nextTexelOffset === 1) {
                mainLoop += `
                     xC${colIndex + 1} = xTexelC${colIndex};
                     `;
              } else {
                mainLoop += `
                     xCOffset = xC + ${nextTexelOffset};

                     if (xCOffset >= 0 && xCOffset < inDims[1] && xTexelC${colIndex + 1}Ready == 0) {
                       xTexelC${colIndex + 1} = getX(batch, xR, xCOffset, d1);
                       if (xCOffset + 1 >= inDims[1]) {
                         xTexelC${colIndex + 1}.zw = vec2(0.0);
                       }
                       xTexelC${colIndex + 1}Ready = 1;
                     }

                     xC${colIndex + 1} = xTexelC${colIndex + 1};
                     `;
              }
            }
          }
        }
      } else {  // stride === 2
        // NOTE(review): this branch is taken for every strideWidth !== 1; the
        // label above suggests callers only use 1 or 2 -- confirm at call
        // sites.
        if (colIndex < filterWidth) {
          // Depending on whether padLeft is even or odd, we want either the
          // xy or zw channels from X texels for xC${colIndex}. If padLeft is
          // even, xC${colIndex + 1} is simply the zw channels of texels we've
          // already sampled. But if padLeft is odd, xC${colIndex + 1}.zw will
          // need to come from the xy channels of a new texel, hence the
          // `vec4 final` initialized below.
          if (padLeft % 2 === 1) {
            mainLoop += `
                 xCOffset = xC + 1 - strides[1];
                 if(xCOffset >= 0 && xCOffset < inDims[1] && xTexelC${colIndex}Ready == 0) {
                   xTexelC${colIndex} = getX(batch, xR, xCOffset, d1);
                   // Need to manually clear unused channels in case
                   // we're reading from recycled texture.
                   if (xCOffset + 1 >= inDims[1]) {
                     xTexelC${colIndex}.zw = vec2(0.0);
                   }
                   xTexelC${colIndex}Ready = 1;
                 }

                 if(xC + 1 >= 0 && xC + 1 < inDims[1] && xTexelC${colIndex + 1}Ready == 0) {
                   xTexelC${colIndex + 1} = getX(batch, xR, xC + 1, d1);
                   // Need to manually clear unused channels in case
                   // we're reading from recycled texture.
                   if (xC + 2 >= inDims[1]) {
                     xTexelC${colIndex + 1}.zw = vec2(0.0);
                   }
                   xTexelC${colIndex + 1}Ready = 1;
                 }

                 xC${colIndex} = vec4(xTexelC${colIndex}.zw, xTexelC${colIndex + 1}.zw);
               `;

            if (colIndex + 1 < filterWidth) {
              // Only final.xy is consumed, so no .zw clearing is needed here.
              mainLoop += `
                   final = vec4(0.0);
                   xCOffset = xC + 1 + strides[1];
                   if(xCOffset >= 0 && xCOffset < inDims[1]) {
                     final = getX(batch, xR, xCOffset, d1);
                   }
                   xC${colIndex + 1} = vec4(xTexelC${colIndex + 1}.xy, final.xy);
                 `;
            }
          } else {
            mainLoop += `
                 if(xC >= 0 && xC < inDims[1] && xTexelC${colIndex}Ready == 0) {
                   xTexelC${colIndex} = getX(batch, xR, xC, d1);
                   if (xC + 1 >= inDims[1]) {
                     xTexelC${colIndex}.zw = vec2(0.0);
                   }
                   xTexelC${colIndex}Ready = 1;
                 }

                 xCOffset = xC + strides[1];
                 if(xCOffset >= 0 && xCOffset < inDims[1] && xTexelC${colIndex + 1}Ready == 0) {
                   xTexelC${colIndex + 1} = getX(batch, xR, xCOffset, d1);
                   if (xCOffset + 1 >= inDims[1]) {
                     xTexelC${colIndex + 1}.zw = vec2(0.);
                   }
                   xTexelC${colIndex + 1}Ready = 1;
                 }

                 xC${colIndex} = vec4(
                   xTexelC${colIndex}.xy, xTexelC${colIndex + 1}.xy);
               `;

            if (colIndex + 1 < filterWidth) {
              mainLoop += `
                   xC${colIndex + 1} = vec4(xTexelC${colIndex}.zw, xTexelC${colIndex + 1}.zw);
                 `;
            }
          }
        }
      }

      // localize the dotProd accumulation within the loop, the theory is for
      // GPU with limited cache, accumulate sum across large amount of
      // variables will cause lots of cache misses. (i.e. 5x5 filter will have
      // 50 variables)
      if (colIndex < filterWidth) {
        mainLoop += `
             wTexel = getW(r, ${colIndex}, d1, d2);
             dotProd += xC${colIndex}.xxzz * vec4(wTexel.xy, wTexel.xy);
             if(d1 + 1 < ${convInfo.inChannels}) {
               dotProd += xC${colIndex}.yyww * vec4(wTexel.zw, wTexel.zw);
             }
           `;

        if (colIndex + 1 < filterWidth) {
          mainLoop += `
               wTexel = getW(r, ${colIndex + 1}, d1, d2);
               dotProd += xC${colIndex + 1}.xxzz * vec4(wTexel.xy, wTexel.xy);
               if(d1 + 1 < ${convInfo.inChannels}) {
                 dotProd += xC${colIndex + 1}.yyww * vec4(wTexel.zw, wTexel.zw);
               }
             `;
        }
      }
    }
    // Close `if (xR ...)`, the `d1` loop and the `r` loop, in that order.
    mainLoop += `
     }
   `;
    mainLoop += `
     }
   `;
    mainLoop += `
     }
   `;

    let activationSnippet = '', applyActivationSnippet = '';
    if (activation) {
      if (hasPreluActivation) {
        activationSnippet = `vec4 activation(vec4 a) {
           vec4 b = getPreluActivationWeightsAtOutCoords();
           ${activation}
         }`;
      } else if (hasLeakyReluAlpha) {
        activationSnippet = `vec4 activation(vec4 a) {
           vec4 b = getLeakyreluAlphaAtOutCoords();
           ${activation}
         }`;
      } else {
        activationSnippet = `vec4 activation(vec4 x) {
           ${activation}
         }`;
      }

      applyActivationSnippet = `result = activation(result);`;
    }

    const addBiasSnippet = addBias ? 'result += getBiasAtOutCoords();' : '';
    // Extra inputs are appended in a fixed order: bias, prelu weights, alpha.
    if (addBias) {
      this.variableNames.push('bias');
    }

    if (hasPreluActivation) {
      this.variableNames.push('preluActivationWeights');
    }
    if (hasLeakyReluAlpha) {
      this.variableNames.push('leakyreluAlpha');
    }

    this.userCode = `
       ${activationSnippet}

       void main() {
         ivec4 coords = getOutputCoords();
         int batch = coords.x;
         ivec2 xRCCorner = coords.yz * strides - pads;
         int d2 = coords.w;
         int xRCorner = xRCCorner.x;
         int xCCorner = xRCCorner.y;

         //intialize dotProd with a small epsilon seems to reduce GPU accuracy loss.
         vec4 dotProd = vec4(0.000000000000001);

         ${mainLoop}

         vec4 result = dotProd - vec4(0.000000000000001);
         ${addBiasSnippet}
         ${applyActivationSnippet}
         setOutput(result);
       }
     `;
  }
}
//# sourceMappingURL=data:application/json;base64,{"version":3,"file":"conv_packed_gpu.js","sourceRoot":"","sources":["../../../../../tfjs-backend-webgl/src/conv_packed_gpu.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;GAeG;AAEF,OAAO,EAAe,IAAI,EAAC,MAAM,uBAAuB,CAAC;AAEzD,OAAO,EAAe,gBAAgB,EAAC,MAAM,cAAc,CAAC;AAE5D,MAAM,OAAO,mBAAmB;IAc9B,YACI,QAAiC,EAAE,OAAO,GAAG,KAAK,EAClD,aAAqB,IAAI,EAAE,kBAAkB,GAAG,KAAK,EACrD,iBAAiB,GAAG,KAAK;QAhB7B,kBAAa,GAAG,CAAC,GAAG,EAAE,GAAG,CAAC,CAAC;QAC3B,iBAAY,GAAG,IAAI,CAAC;QACpB,iBAAY,GAAG,IAAI,CAAC;QAIpB,mBAAc,GAAG;YACf,EAAC,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE,OAAgB,EAAE;YACvC,EAAC,IAAI,EAAE,SAAS,EAAE,IAAI,EAAE,OAAgB,EAAE;YAC1C,EAAC,IAAI,EAAE,WAAW,EAAE,IAAI,EAAE,OAAgB,EAAE;YAC5C,EAAC,IAAI,EAAE,QAAQ,EAAE,IAAI,EAAE,OAAgB,EAAE;SAC1C,CAAC;QAMA,IAAI,CAAC,WAAW,GAAG,QAAQ,CAAC,QAAQ,CAAC;QACrC,IAAI,CAAC,mBAAmB,GAAG,gBAAgB,CAAC,IAAI,CAAC,WAAW,CAAC,MAAM,CAAC,CAAC;QACrE,MAAM,OAAO,GAAG,QAAQ,CAAC,OAAO,CAAC,IAAI,CAAC;QACtC,MAAM,WAAW,GAAG,QAAQ,CAAC,WAAW,CAAC;QACzC,MAAM,aAAa,GAAG,QAAQ,CAAC,aAAa,CAAC;QAC7C,MAAM,YAAY,GAAG,QAAQ,CAAC,YAAY,CAAC;QAC3C,MAAM,WAAW,GAAG,QAAQ,CAAC,WAAW,CAAC;QACzC,MAAM,YAAY,GAAG,WAAW,CAAC;QAEjC,IAAI,QAAQ,GAAG;;+CAE2B,CAAC;QAE3C,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,WAAW,EAAE,CAAC,EAAE,EAAE;YACpC,QAAQ,IAAI;yBACM,CAAC,GAAG,CAAC;wBACN,CAAC,GAAG,CAAC;yBACJ,CAAC,GAAG,CAAC,GAAG,CAAC;wBACV,CAAC,GAAG,CAAC,GAAG,CAAC;oBACb,CAAC,GAAG,CAAC;SACnB;QAED;;;;;;;WAOG;QACH,QAAQ,IAAI;2BACU,YAAY;8BACT,QAAQ,CAAC,UAAU;QACzC,CAAC;QACJ,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,WAAW,EAAE,CAAC,EAAE,EAAE;YACpC,QAAQ,IAAI;oBACC,CAAC,GAAG,CAAC;oBACL,CAAC,GAAG,CAAC;oBACL,CAAC,GAAG,CAAC,GAAG,CAAC;oBACT,CAAC,GAAG,CAAC,GAAG,CAAC;eACd,CAAC,eAAe,CAAC;SAC1B;QACD,QAAQ,IAAI;;;QAGT,CAAC;QAEJ,KAAK,IAAI,MAAM,GAAG,CAAC,EAAE,MAAM,GAAG,CAAC,YAAY,GAAG,CAAC,CAAC,GAAG,CAAC,EAAE,MAAM,EAAE,EAAE;YAC9D,MAAM,QAAQ,GAAG,MA
AM,GAAG,CAAC,CAAC;YAE5B,QAAQ,IAAI;6BACU,QAAQ,GAAG,aAAa;YACzC,CAAC;YAEN,IAAI,WAAW,KAAK,CAAC,EAAE;gBACrB,IAAI,QAAQ,GAAG,WAAW,EAAE;oBAC1B,2DAA2D;oBAC3D,IAAI,OAAO,GAAG,CAAC,KAAK,CAAC,EAAE;wBACrB,kEAAkE;wBAClE,+DAA+D;wBAC/D,sDAAsD;wBAEtD,uBAAuB;wBACvB,6DAA6D;wBAC7D,qBAAqB;wBACrB,gDAAgD;wBAEhD,QAAQ,IAAI;;uEAGR,QAAQ;4BACG,QAAQ;;;;;8BAKN,QAAQ;;4BAEV,QAAQ;;gBAEpB,CAAC;wBACJ,iEAAiE;wBACjE,QAAQ;wBACR,IAAI,aAAa,KAAK,CAAC,IAAI,QAAQ,GAAG,CAAC,EAAE;4BACvC,QAAQ,IAAI;qBACN,QAAQ,kBAAkB,QAAQ,GAAG,CAAC,eACxC,QAAQ;kBACT,CAAC;yBACL;6BAAM;4BACL,QAAQ,IAAI;;;;;;;;;;;;yBAYF,QAAQ,+BAA+B,QAAQ;;yBAE/C,QAAQ,4BAA4B,QAAQ;;oBAEjD,CAAC;yBACP;qBACF;yBAAM;wBACL,yDAAyD;wBACzD,QAAQ,IAAI;2DACkC,QAAQ;4BACvC,QAAQ;;8BAEN,QAAQ;;4BAEV,QAAQ;;;qBAGf,QAAQ,aAAa,QAAQ;kBAChC,CAAC;qBACP;oBAED,IAAI,QAAQ,GAAG,CAAC,GAAG,WAAW,EAAE;wBAC9B,+DAA+D;wBAC/D,gEAAgE;wBAChE,gEAAgE;wBAChE,iEAAiE;wBACjE,2BAA2B;wBAE3B,MAAM,eAAe,GAAG,OAAO,GAAG,CAAC,KAAK,CAAC,CAAC,CAAC;4BACvC,IAAI,CAAC,iBAAiB,CAAC,aAAa,CAAC,CAAC,CAAC;4BACvC,aAAa,CAAC;wBAElB,IAAI,CAAC,aAAa,GAAG,CAAC,KAAK,CAAC,IAAI,OAAO,GAAG,CAAC,KAAK,CAAC,CAAC;4BAC9C,CAAC,aAAa,GAAG,CAAC,KAAK,CAAC,IAAI,OAAO,GAAG,CAAC,KAAK,CAAC,CAAC,EAAE;4BAClD,QAAQ,IAAI;wDAC6B,eAAe;;yEAGpD,QAAQ,GAAG,CAAC;8BACD,QAAQ,GAAG,CAAC;;;;;gCAKV,QAAQ,GAAG,CAAC;;8BAEd,QAAQ,GAAG,CAAC;;oBAEtB,CAAC;4BAEN,+DAA+D;4BAC/D,6DAA6D;4BAC7D,IAAI,aAAa,GAAG,CAAC,EAAE;gCACrB,QAAQ,IAAI;;;;0BAIH,QAAQ,GAAG,CAAC,+BACd,QAAQ,GAAG,CAAC;;0BAEV,QAAQ,GAAG,CAAC,4BACd,QAAQ,GAAG,CAAC;;sBAEd,CAAC;6BACP;iCAAM;gCACL,QAAQ,IAAI;yBACJ,QAAQ,GAAG,CAAC,kBAAkB,QAAQ,eAC1C,QAAQ,GAAG,CAAC;sBACX,CAAC;6BACP;yBAEF;6BAAM;4BACL,gEAAgE;4BAChE,4DAA4D;4BAC5D,gCAAgC;4BAChC,IAAI,eAAe,KAAK,CAAC,EAAE;gCACzB,QAAQ,IAAI;yBACJ,QAAQ,GAAG,CAAC,aAAa,QAAQ;sBACpC,CAAC;6BACP;iCAAM;gCACL,QAAQ,IAAI;uCACU,eAAe;;2EAGjC,QAAQ,GAAG,CAAC;gCACD,QAAQ,GAAG,CAAC;;kCAEV,QAAQ,GAAG,CAAC;;gCAEd,QAAQ,GAAG,CAAC;;;yBAGnB,QAAQ,GAAG,CAAC,aAAa,QAAQ,GAAG,CAAC;sBACxC,CAAC;6BACP;yBACF;qBACF;iBACF;aACF;iBAAM,EAAG,eAAe;gBACvB,IAAI,QAAQ,GAAG,WAAW,EAAE;oBAC1B,kEAAkE;oBAClE,mEAAmE;oBACnE,mEAAmE;oBACnE,6DA
A6D;oBAC7D,gEAAgE;oBAChE,OAAO;oBACP,4BAA4B;oBAC5B,IAAI,OAAO,GAAG,CAAC,KAAK,CAAC,EAAE;wBACrB,QAAQ,IAAI;;sEAGR,QAAQ;4BACG,QAAQ;;;;8BAIN,QAAQ;;4BAEV,QAAQ;;;kEAInB,QAAQ,GAAG,CAAC;4BACD,QAAQ,GAAG,CAAC;;;;8BAIV,QAAQ,GAAG,CAAC;;4BAEd,QAAQ,GAAG,CAAC;;;qBAGnB,QAAQ,kBAAkB,QAAQ,eACtC,QAAQ,GAAG,CAAC;gBACb,CAAC;wBAEJ,IAAI,QAAQ,GAAG,CAAC,GAAG,WAAW,EAAE;4BAC9B,QAAQ,IAAI;;;;;;uBAMJ,QAAQ,GAAG,CAAC,kBAAkB,QAAQ,GAAG,CAAC;kBAC/C,CAAC;yBACL;qBACF;yBAAM;wBACL,QAAQ,IAAI;0DACiC,QAAQ;4BACtC,QAAQ;;8BAEN,QAAQ;;4BAEV,QAAQ;;;;sEAKnB,QAAQ,GAAG,CAAC;4BACD,QAAQ,GAAG,CAAC;;8BAEV,QAAQ,GAAG,CAAC;;4BAEd,QAAQ,GAAG,CAAC;;;qBAGnB,QAAQ;4BACD,QAAQ,eAAe,QAAQ,GAAG,CAAC;gBAC/C,CAAC;wBAEJ,IAAI,QAAQ,GAAG,CAAC,GAAG,WAAW,EAAE;4BAC9B,QAAQ,IAAI;uBACJ,QAAQ,GAAG,CAAC,kBAAkB,QAAQ,eAC1C,QAAQ,GAAG,CAAC;kBACb,CAAC;yBACL;qBACF;iBACF;aACF;YAED,uEAAuE;YACvE,gEAAgE;YAChE,wEAAwE;YACxE,gBAAgB;YAChB,IAAI,QAAQ,GAAG,WAAW,EAAE;gBAC1B,QAAQ,IAAI;gCACW,QAAQ;4BACZ,QAAQ;2BACT,QAAQ,CAAC,UAAU;8BAChB,QAAQ;;YAE1B,CAAC;gBAEJ,IAAI,QAAQ,GAAG,CAAC,GAAG,WAAW,EAAE;oBAC9B,QAAQ,IAAI;kCACW,QAAQ,GAAG,CAAC;8BAChB,QAAQ,GAAG,CAAC;6BACb,QAAQ,CAAC,UAAU;gCAChB,QAAQ,GAAG,CAAC;;cAE9B,CAAC;iBACL;aACF;SACF;QACD,QAAQ,IAAI;;IAEb,CAAC;QACF,QAAQ,IAAI;;IAEX,CAAC;QACF,QAAQ,IAAI;;IAEX,CAAC;QAEA,IAAI,iBAAiB,GAAG,EAAE,EAAE,sBAAsB,GAAG,EAAE,CAAC;QACxD,IAAI,UAAU,EAAE;YACd,IAAI,kBAAkB,EAAE;gBACtB,iBAAiB,GAAG;;aAEhB,UAAU;WACZ,CAAC;aACJ;iBAAM,IAAI,iBAAiB,EAAE;gBAC5B,iBAAiB,GAAG;;aAEhB,UAAU;WACZ,CAAC;aACJ;iBAAM;gBACL,iBAAiB,GAAG;aAChB,UAAU;WACZ,CAAC;aACJ;YAED,sBAAsB,GAAG,8BAA8B,CAAC;SACzD;QAED,MAAM,cAAc,GAAG,OAAO,CAAC,CAAC,CAAC,iCAAiC,CAAC,CAAC,CAAC,EAAE,CAAC;QACxE,IAAI,OAAO,EAAE;YACX,IAAI,CAAC,aAAa,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;SACjC;QAED,IAAI,kBAAkB,EAAE;YACtB,IAAI,CAAC,aAAa,CAAC,IAAI,CAAC,wBAAwB,CAAC,CAAC;SACnD;QACD,IAAI,iBAAiB,EAAE;YACrB,IAAI,CAAC,aAAa,CAAC,IAAI,CAAC,gBAAgB,CAAC,CAAC;SAC3C;QAED,IAAI,CAAC,QAAQ,GAAG;SACZ,iBAAiB;;;;;;;;;;;;;WAaf,QAAQ;;;WAGR,cAAc;WACd,sBAAsB;;;MAG3B,CAAC;IACJ,CAAC;CACF","sourcesContent":["/**\n * @license\n * Copyright 2022 Google LLC. 
All Rights Reserved.\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may not use this file except in compliance with the License.\n * You may obtain a copy of the License at\n *\n * http://www.apache.org/licenses/LICENSE-2.0\n *\n * Unless required by applicable law or agreed to in writing, software\n * distributed under the License is distributed on an \"AS IS\" BASIS,\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n * See the License for the specific language governing permissions and\n * limitations under the License.\n * =============================================================================\n */\n\n import {backend_util, util} from '@tensorflow/tfjs-core';\n\n import {GPGPUProgram, useShapeUniforms} from './gpgpu_math';\n\n export class Conv2DPackedProgram implements GPGPUProgram {\n   variableNames = ['x', 'W'];\n   packedInputs = true;\n   packedOutput = true;\n   outputShape: number[];\n   userCode: string;\n   enableShapeUniforms: boolean;\n   customUniforms = [\n     {name: 'pads', type: 'ivec2' as const },\n     {name: 'strides', type: 'ivec2' as const },\n     {name: 'dilations', type: 'ivec2' as const },\n     {name: 'inDims', type: 'ivec2' as const },\n   ];\n\n   constructor(\n       convInfo: backend_util.Conv2DInfo, addBias = false,\n       activation: string = null, hasPreluActivation = false,\n       hasLeakyReluAlpha = false) {\n     this.outputShape = convInfo.outShape;\n     this.enableShapeUniforms = useShapeUniforms(this.outputShape.length);\n     const padLeft = convInfo.padInfo.left;\n     const strideWidth = convInfo.strideWidth;\n     const dilationWidth = convInfo.dilationWidth;\n     const filterHeight = convInfo.filterHeight;\n     const filterWidth = convInfo.filterWidth;\n     const texelsAcross = filterWidth;\n\n     let mainLoop = `\n       int xR; int xC; int xCOffset;\n       vec4 wTexel; vec4 previous; vec4 final;`;\n\n     for (let c = 0; c < filterWidth; c++) 
{\n       mainLoop += `\n           vec4 xTexelC${c * 2};\n           int xTexelC${c * 2}Ready;\n           vec4 xTexelC${c * 2 + 1};\n           int xTexelC${c * 2 + 1}Ready;\n           vec4 xC${c};`;\n     }\n\n     /**\n      * This vectorized implementation works by gathering the values needed for\n      * each output channel's dot product into vec4's and then multiplying them\n      * all together (this happens in the final double for-loop below). Most of\n      * the main loop consists of constructing these vec4's with the minimum\n      * number of texture2D calls, which means making use of all four returned\n      * values from a texture2D call at once.\n      */\n     mainLoop += `\n     for (int r = 0; r < ${filterHeight}; r++) {\n      for (int d1 = 0; d1 < ${convInfo.inChannels}; d1 += 2) {\n       `;\n     for (let c = 0; c < filterWidth; c++) {\n       mainLoop += `\n           xTexelC${c * 2} = vec4(0.0);\n           xTexelC${c * 2}Ready = 0;\n           xTexelC${c * 2 + 1} = vec4(0.0);\n           xTexelC${c * 2 + 1}Ready = 0;\n           xC${c} = vec4(0.0);`;\n     }\n     mainLoop += `\n         xR = xRCorner + r * dilations[0];\n         if (xR >=0 && xR < inDims[0]) {\n       `;\n\n     for (let texelC = 0; texelC < (texelsAcross + 1) / 2; texelC++) {\n       const colIndex = texelC * 2;\n\n       mainLoop += `\n           xC = xCCorner + ${colIndex * dilationWidth};\n           `;\n\n       if (strideWidth === 1) {\n         if (colIndex < filterWidth) {\n           // If padding is odd, the outer texels have to be composed.\n           if (padLeft % 2 === 1) {\n             // TODO: Ensure vec4 previous does not result in redundant sample,\n             // and avoid setting xTexelRC's that exceed the boundary in the\n             // first place rather than resetting them to vec4(0)).\n\n             // To compute xCOffset:\n             // - If padding is odd, we must add 1 to ensure we ask for an\n             // even-numbered row.\n         
    // - We subtract 2 to access the previous texel.\n\n             mainLoop += `\n                 xCOffset = xC + 1;\n                 if (xCOffset >= 0 && xCOffset < inDims[1] && xTexelC${\n                 colIndex}Ready == 0) {\n                   xTexelC${colIndex} = getX(batch, xR, xCOffset, d1);\n\n                   // Need to manually clear unused channels in case\n                   // we're reading from recycled texture.\n                   if (xCOffset + 1 >= inDims[1]) {\n                     xTexelC${colIndex}.zw = vec2(0.0);\n                   }\n                   xTexelC${colIndex}Ready = 1;\n                 }\n               `;\n             // This texel has been read in previous iteration if the dilation\n             // is 1.\n             if (dilationWidth === 1 && colIndex > 0) {\n               mainLoop += `\n                 xC${colIndex} = vec4(xTexelC${colIndex - 2}.zw, xTexelC${\n                   colIndex}.xy);\n                 `;\n             } else {\n               mainLoop += `\n                   xCOffset = xC + 1 - 2;\n\n                   if (xCOffset >= 0 && xCOffset < inDims[1]) {\n                     previous = getX(batch, xR, xCOffset, d1);\n\n                     // Need to manually clear unused channels in case\n                     // we're reading from recycled texture.\n                     if (xCOffset + 1 >= inDims[1]) {\n                       previous.zw = vec2(0.0);\n                     }\n\n                     xC${colIndex} = vec4(previous.zw, xTexelC${colIndex}.xy);\n                   } else {\n                     xC${colIndex} = vec4(0.0, 0.0, xTexelC${colIndex}.xy);\n                   }\n                   `;\n             }\n           } else {\n             // Padding is even, so xRC corresponds to a single texel.\n             mainLoop += `\n                 if (xC >= 0 && xC < inDims[1] && xTexelC${colIndex}Ready == 0) {\n                   xTexelC${colIndex} = getX(batch, xR, xC, d1);\n         
          if (xC + 1 >= inDims[1]) {\n                     xTexelC${colIndex}.zw = vec2(0.0);\n                   }\n                   xTexelC${colIndex}Ready = 1;\n                 }\n\n                 xC${colIndex} = xTexelC${colIndex};\n                 `;\n           }\n\n           if (colIndex + 1 < filterWidth) {\n             // If dilation is even, the second entry should match the first\n             // (either both are composed or both are single samples). But if\n             // dilation is odd, then the second entry should be the opposite\n             // of the first (if the first is composed, the second is a single\n             // sample, and vice versa.)\n\n             const nextTexelOffset = padLeft % 2 === 0 ?\n                 util.nearestLargerEven(dilationWidth) :\n                 dilationWidth;\n\n             if ((dilationWidth % 2 === 0 && padLeft % 2 === 1) ||\n                 (dilationWidth % 2 !== 0 && padLeft % 2 !== 1)) {\n               mainLoop += `\n                   xCOffset = xC + imod(pads[1], 2) + ${nextTexelOffset};\n\n                   if (xCOffset >= 0 && xCOffset < inDims[1] && xTexelC${\n                   colIndex + 1}Ready == 0) {\n                     xTexelC${colIndex + 1} = getX(batch, xR, xCOffset, d1);\n\n                     // Need to manually clear unused channels in case\n                     // we're reading from recycled texture.\n                     if (xCOffset + 1 >= inDims[1]) {\n                       xTexelC${colIndex + 1}.zw = vec2(0.0);\n                     }\n                     xTexelC${colIndex + 1}Ready = 1;\n                   }\n                   `;\n\n               // If dilation > 1 then the xRC's will not be able to share any\n               // values, so each xRC will require two unique calls to getX.\n               if (dilationWidth > 1) {\n                 mainLoop += `\n                     xCOffset -= 2;\n                     if (xCOffset >= 0 && xCOffset < inDims[1]) {\n      
                previous = getX(batch, xR, xCOffset, d1);\n                      xC${colIndex + 1} = vec4(previous.zw, xTexelC${\n                        colIndex + 1}.xy);\n                     } else {\n                      xC${colIndex + 1} = vec4(0.0, 0.0, xTexelC${\n                        colIndex + 1}.xy);\n                     }\n                     `;\n               } else {\n                 mainLoop += `\n                     xC${colIndex + 1} = vec4(xTexelC${colIndex}.zw, xTexelC${\n                     colIndex + 1}.xy);\n                     `;\n               }\n\n             } else {\n               // If dilation is 1 and padding is odd, we have already read the\n               // texel when constructing the previous x value. Here we can\n               // simply skip the texture read.\n               if (nextTexelOffset === 1) {\n                 mainLoop += `\n                     xC${colIndex + 1} = xTexelC${colIndex};\n                     `;\n               } else {\n                 mainLoop += `\n                     xCOffset = xC + ${nextTexelOffset};\n\n                     if (xCOffset >= 0 && xCOffset < inDims[1] && xTexelC${\n                     colIndex + 1}Ready == 0) {\n                       xTexelC${colIndex + 1} = getX(batch, xR, xCOffset, d1);\n                       if (xCOffset + 1 >= inDims[1]) {\n                         xTexelC${colIndex + 1}.zw = vec2(0.0);\n                       }\n                       xTexelC${colIndex + 1}Ready = 1;\n                     }\n\n                     xC${colIndex + 1} = xTexelC${colIndex + 1};\n                     `;\n               }\n             }\n           }\n         }\n       } else {  // stride === 2\n         if (colIndex < filterWidth) {\n           // Depending on whether padLeft is even or odd, we want either the\n           // xy or zw channels from X texels for xC${colIndex}. 
If padLeft is\n           // even, xC${colIndex +1} is simply the zw channels of texels we've\n           // already sampled. But if padLeft is odd, xC{$c + 1}.zw will\n           // need to come from the xy channels of a new texel, hence the `\n           // vec4\n           // final` initialized below.\n           if (padLeft % 2 === 1) {\n             mainLoop += `\n                 xCOffset = xC + 1 - strides[1];\n                 if(xCOffset >= 0 && xCOffset < inDims[1] && xTexelC${\n                 colIndex}Ready == 0) {\n                   xTexelC${colIndex} = getX(batch, xR, xCOffset, d1);\n                   // Need to manually clear unused channels in case\n                   // we're reading from recycled texture.\n                   if (xCOffset + 1 >= inDims[1]) {\n                     xTexelC${colIndex}.zw = vec2(0.0);\n                   }\n                   xTexelC${colIndex}Ready = 1;\n                 }\n\n                 if(xC + 1 >= 0 && xC + 1 < inDims[1] && xTexelC${\n                 colIndex + 1}Ready == 0) {\n                   xTexelC${colIndex + 1} = getX(batch, xR, xC + 1, d1);\n                   // Need to manually clear unused channels in case\n                   // we're reading from recycled texture.\n                   if (xC + 2 >= inDims[1]) {\n                     xTexelC${colIndex + 1}.zw = vec2(0.0);\n                   }\n                   xTexelC${colIndex + 1}Ready = 1;\n                 }\n\n                 xC${colIndex} = vec4(xTexelC${colIndex}.zw, xTexelC${\n                 colIndex + 1}.zw);\n               `;\n\n             if (colIndex + 1 < filterWidth) {\n               mainLoop += `\n                   final = vec4(0.0);\n                   xCOffset = xC + 1 + strides[1];\n                   if(xCOffset >= 0 && xCOffset < inDims[1]) {\n                     final = getX(batch, xR, xCOffset, d1);\n                   }\n                   xC${colIndex + 1} = vec4(xTexelC${colIndex + 1}.xy, final.xy);\n         
        `;\n             }\n           } else {\n             mainLoop += `\n                 if(xC >= 0 && xC < inDims[1] && xTexelC${colIndex}Ready == 0) {\n                   xTexelC${colIndex} = getX(batch, xR, xC, d1);\n                   if (xC + 1 >= inDims[1]) {\n                     xTexelC${colIndex}.zw = vec2(0.0);\n                   }\n                   xTexelC${colIndex}Ready = 1;\n                 }\n\n                 xCOffset = xC + strides[1];\n                 if(xCOffset >= 0 && xCOffset < inDims[1] && xTexelC${\n                 colIndex + 1}Ready == 0) {\n                   xTexelC${colIndex + 1} = getX(batch, xR, xCOffset, d1);\n                   if (xCOffset + 1 >= inDims[1]) {\n                     xTexelC${colIndex + 1}.zw = vec2(0.);\n                   }\n                   xTexelC${colIndex + 1}Ready = 1;\n                 }\n\n                 xC${colIndex} = vec4(\n                   xTexelC${colIndex}.xy, xTexelC${colIndex + 1}.xy);\n               `;\n\n             if (colIndex + 1 < filterWidth) {\n               mainLoop += `\n                   xC${colIndex + 1} = vec4(xTexelC${colIndex}.zw, xTexelC${\n                   colIndex + 1}.zw);\n                 `;\n             }\n           }\n         }\n       }\n\n       // localize the dotProd accumulation within the loop, the theory is for\n       // GPU with limited cache, accumulate sum across large amount of\n       // veriables will cause lots of cache misses. (i.e. 
5x5 filter will have\n       // 50 variables)\n       if (colIndex < filterWidth) {\n         mainLoop += `\n             wTexel = getW(r, ${colIndex}, d1, d2);\n             dotProd += xC${colIndex}.xxzz * vec4(wTexel.xy, wTexel.xy);\n             if(d1 + 1 < ${convInfo.inChannels}) {\n               dotProd += xC${colIndex}.yyww * vec4(wTexel.zw, wTexel.zw);\n             }\n           `;\n\n         if (colIndex + 1 < filterWidth) {\n           mainLoop += `\n               wTexel = getW(r, ${colIndex + 1}, d1, d2);\n               dotProd += xC${colIndex + 1}.xxzz * vec4(wTexel.xy, wTexel.xy);\n               if(d1 + 1 < ${convInfo.inChannels}) {\n                 dotProd += xC${colIndex + 1}.yyww * vec4(wTexel.zw, wTexel.zw);\n               }\n             `;\n         }\n       }\n     }\n     mainLoop += `\n     }\n   `;\n   mainLoop += `\n     }\n   `;\n   mainLoop += `\n     }\n   `;\n\n     let activationSnippet = '', applyActivationSnippet = '';\n     if (activation) {\n       if (hasPreluActivation) {\n         activationSnippet = `vec4 activation(vec4 a) {\n           vec4 b = getPreluActivationWeightsAtOutCoords();\n           ${activation}\n         }`;\n       } else if (hasLeakyReluAlpha) {\n         activationSnippet = `vec4 activation(vec4 a) {\n           vec4 b = getLeakyreluAlphaAtOutCoords();\n           ${activation}\n         }`;\n       } else {\n         activationSnippet = `vec4 activation(vec4 x) {\n           ${activation}\n         }`;\n       }\n\n       applyActivationSnippet = `result = activation(result);`;\n     }\n\n     const addBiasSnippet = addBias ? 
'result += getBiasAtOutCoords();' : '';\n     if (addBias) {\n       this.variableNames.push('bias');\n     }\n\n     if (hasPreluActivation) {\n       this.variableNames.push('preluActivationWeights');\n     }\n     if (hasLeakyReluAlpha) {\n       this.variableNames.push('leakyreluAlpha');\n     }\n\n     this.userCode = `\n       ${activationSnippet}\n\n       void main() {\n         ivec4 coords = getOutputCoords();\n         int batch = coords.x;\n         ivec2 xRCCorner = coords.yz * strides - pads;\n         int d2 = coords.w;\n         int xRCorner = xRCCorner.x;\n         int xCCorner = xRCCorner.y;\n\n         //intialize dotProd with a small epsilon seems to reduce GPU accuracy loss.\n         vec4 dotProd = vec4(0.000000000000001);\n\n         ${mainLoop}\n\n         vec4 result = dotProd - vec4(0.000000000000001);\n         ${addBiasSnippet}\n         ${applyActivationSnippet}\n         setOutput(result);\n       }\n     `;\n   }\n }\n"]}