From 76330361ae1345a4b622c2712b6b96ba606de4a4 Mon Sep 17 00:00:00 2001
From: Rye Mutt <rye@alchemyviewer.org>
Date: Thu, 26 Dec 2024 04:31:40 -0500
Subject: [PATCH] Fix various crashes in CAS shader load

---
 .../shaders/class1/alchemy/CASF.glsl          | 1249 +----------------
 1 file changed, 30 insertions(+), 1219 deletions(-)

diff --git a/indra/newview/app_settings/shaders/class1/alchemy/CASF.glsl b/indra/newview/app_settings/shaders/class1/alchemy/CASF.glsl
index d4c794c7fe7..41b92fe3b9a 100644
--- a/indra/newview/app_settings/shaders/class1/alchemy/CASF.glsl
+++ b/indra/newview/app_settings/shaders/class1/alchemy/CASF.glsl
@@ -225,7 +225,7 @@ uniform uvec4 cas_param_1;
 //------------------------------------------------------------------------------------------------------------------------------
 // TODO
 // ====
-//  - Replace transcendentals with manual versions. 
+//  - Replace transcendentals with manual versions.
 //==============================================================================================================================
  #ifdef A_GCC
   A_STATIC AD1 AAbsD1(AD1 a){return __builtin_fabs(a);}
@@ -286,7 +286,7 @@ uniform uvec4 cas_param_1;
  A_STATIC AL1 AMaxL1(AL1 a,AL1 b){return a>b?a:b;}
  A_STATIC AU1 AMaxU1(AU1 a,AU1 b){return a>b?a:b;}
 //------------------------------------------------------------------------------------------------------------------------------
- // These follow the convention that A integer types don't have signage, until they are operated on. 
+ // These follow the convention that A integer types don't have signage, until they are operated on.
  A_STATIC AL1 AMaxSL1(AL1 a,AL1 b){return (ASL1_(a)>ASL1_(b))?a:b;}
  A_STATIC AU1 AMaxSU1(AU1 a,AU1 b){return (ASU1_(a)>ASU1_(b))?a:b;}
 //------------------------------------------------------------------------------------------------------------------------------
@@ -533,11 +533,6 @@ uniform uvec4 cas_param_1;
 //==============================================================================================================================
 #if defined(A_GLSL) && defined(A_GPU)
  #ifndef A_SKIP_EXT
-  #ifdef A_HALF
-   #extension GL_EXT_shader_16bit_storage:require
-   #extension GL_EXT_shader_explicit_arithmetic_types:require 
-  #endif
-//------------------------------------------------------------------------------------------------------------------------------
   #ifdef A_LONG
    #extension GL_ARB_gpu_shader_int64:require
    #extension GL_NV_shader_atomic_int64:require
@@ -580,17 +575,6 @@ uniform uvec4 cas_param_1;
  #define AU2_AF2(x) floatBitsToUint(AF2(x))
  #define AU3_AF3(x) floatBitsToUint(AF3(x))
  #define AU4_AF4(x) floatBitsToUint(AF4(x))
-//------------------------------------------------------------------------------------------------------------------------------
- AU1 AU1_AH1_AF1_x(AF1 a){return packHalf2x16(AF2(a,0.0));}
- #define AU1_AH1_AF1(a) AU1_AH1_AF1_x(AF1(a))
-//------------------------------------------------------------------------------------------------------------------------------
- #define AU1_AH2_AF2 packHalf2x16
- #define AU1_AW2Unorm_AF2 packUnorm2x16
- #define AU1_AB4Unorm_AF4 packUnorm4x8
-//------------------------------------------------------------------------------------------------------------------------------
- #define AF2_AH2_AU1 unpackHalf2x16
- #define AF2_AW2Unorm_AU1 unpackUnorm2x16
- #define AF4_AB4Unorm_AU1 unpackUnorm4x8
 //==============================================================================================================================
  AF1 AF1_x(AF1 a){return AF1(a);}
  AF2 AF2_x(AF1 a){return AF2(a,a);}
@@ -749,130 +733,6 @@ uniform uvec4 cas_param_1;
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 //_____________________________________________________________/\_______________________________________________________________
 //==============================================================================================================================
-//                                                          GLSL HALF
-//==============================================================================================================================
- #ifdef A_HALF
-  #define AH1 float16_t
-  #define AH2 f16vec2
-  #define AH3 f16vec3
-  #define AH4 f16vec4
-//------------------------------------------------------------------------------------------------------------------------------
-  #define AW1 uint16_t
-  #define AW2 u16vec2
-  #define AW3 u16vec3
-  #define AW4 u16vec4
-//------------------------------------------------------------------------------------------------------------------------------
-  #define ASW1 int16_t
-  #define ASW2 i16vec2
-  #define ASW3 i16vec3
-  #define ASW4 i16vec4
-//==============================================================================================================================
-  #define AH2_AU1(x) unpackFloat2x16(AU1(x))
-  AH4 AH4_AU2_x(AU2 x){return AH4(unpackFloat2x16(x.x),unpackFloat2x16(x.y));}
-  #define AH4_AU2(x) AH4_AU2_x(AU2(x))
-  #define AW2_AU1(x) unpackUint2x16(AU1(x))
-  #define AW4_AU2(x) unpackUint4x16(pack64(AU2(x)))
-//------------------------------------------------------------------------------------------------------------------------------
-  #define AU1_AH2(x) packFloat2x16(AH2(x))
-  AU2 AU2_AH4_x(AH4 x){return AU2(packFloat2x16(x.xy),packFloat2x16(x.zw));}
-  #define AU2_AH4(x) AU2_AH4_x(AH4(x))
-  #define AU1_AW2(x) packUint2x16(AW2(x))
-  #define AU2_AW4(x) unpack32(packUint4x16(AW4(x)))
-//==============================================================================================================================
-  #define AW1_AH1(x) halfBitsToUint16(AH1(x))
-  #define AW2_AH2(x) halfBitsToUint16(AH2(x))
-  #define AW3_AH3(x) halfBitsToUint16(AH3(x))
-  #define AW4_AH4(x) halfBitsToUint16(AH4(x))
-//------------------------------------------------------------------------------------------------------------------------------
-  #define AH1_AW1(x) uint16BitsToHalf(AW1(x))
-  #define AH2_AW2(x) uint16BitsToHalf(AW2(x))
-  #define AH3_AW3(x) uint16BitsToHalf(AW3(x))
-  #define AH4_AW4(x) uint16BitsToHalf(AW4(x))
-//==============================================================================================================================
-  AH1 AH1_x(AH1 a){return AH1(a);}
-  AH2 AH2_x(AH1 a){return AH2(a,a);}
-  AH3 AH3_x(AH1 a){return AH3(a,a,a);}
-  AH4 AH4_x(AH1 a){return AH4(a,a,a,a);}
-  #define AH1_(a) AH1_x(AH1(a))
-  #define AH2_(a) AH2_x(AH1(a))
-  #define AH3_(a) AH3_x(AH1(a))
-  #define AH4_(a) AH4_x(AH1(a))
-//------------------------------------------------------------------------------------------------------------------------------
-  AW1 AW1_x(AW1 a){return AW1(a);}
-  AW2 AW2_x(AW1 a){return AW2(a,a);}
-  AW3 AW3_x(AW1 a){return AW3(a,a,a);}
-  AW4 AW4_x(AW1 a){return AW4(a,a,a,a);}
-  #define AW1_(a) AW1_x(AW1(a))
-  #define AW2_(a) AW2_x(AW1(a))
-  #define AW3_(a) AW3_x(AW1(a))
-  #define AW4_(a) AW4_x(AW1(a))
-//==============================================================================================================================
-  AW1 AAbsSW1(AW1 a){return AW1(abs(ASW1(a)));}
-  AW2 AAbsSW2(AW2 a){return AW2(abs(ASW2(a)));}
-  AW3 AAbsSW3(AW3 a){return AW3(abs(ASW3(a)));}
-  AW4 AAbsSW4(AW4 a){return AW4(abs(ASW4(a)));}
-//------------------------------------------------------------------------------------------------------------------------------
-  AH1 AClampH1(AH1 x,AH1 n,AH1 m){return clamp(x,n,m);}
-  AH2 AClampH2(AH2 x,AH2 n,AH2 m){return clamp(x,n,m);}
-  AH3 AClampH3(AH3 x,AH3 n,AH3 m){return clamp(x,n,m);}
-  AH4 AClampH4(AH4 x,AH4 n,AH4 m){return clamp(x,n,m);}
-//------------------------------------------------------------------------------------------------------------------------------
-  AH1 AFractH1(AH1 x){return fract(x);}
-  AH2 AFractH2(AH2 x){return fract(x);}
-  AH3 AFractH3(AH3 x){return fract(x);}
-  AH4 AFractH4(AH4 x){return fract(x);}
-//------------------------------------------------------------------------------------------------------------------------------
-  AH1 ALerpH1(AH1 x,AH1 y,AH1 a){return mix(x,y,a);}
-  AH2 ALerpH2(AH2 x,AH2 y,AH2 a){return mix(x,y,a);}
-  AH3 ALerpH3(AH3 x,AH3 y,AH3 a){return mix(x,y,a);}
-  AH4 ALerpH4(AH4 x,AH4 y,AH4 a){return mix(x,y,a);}
-//------------------------------------------------------------------------------------------------------------------------------
-  // No packed version of max3.
-  AH1 AMax3H1(AH1 x,AH1 y,AH1 z){return max(x,max(y,z));}
-  AH2 AMax3H2(AH2 x,AH2 y,AH2 z){return max(x,max(y,z));}
-  AH3 AMax3H3(AH3 x,AH3 y,AH3 z){return max(x,max(y,z));}
-  AH4 AMax3H4(AH4 x,AH4 y,AH4 z){return max(x,max(y,z));}
-//------------------------------------------------------------------------------------------------------------------------------
-  AW1 AMaxSW1(AW1 a,AW1 b){return AW1(max(ASU1(a),ASU1(b)));}
-  AW2 AMaxSW2(AW2 a,AW2 b){return AW2(max(ASU2(a),ASU2(b)));}
-  AW3 AMaxSW3(AW3 a,AW3 b){return AW3(max(ASU3(a),ASU3(b)));}
-  AW4 AMaxSW4(AW4 a,AW4 b){return AW4(max(ASU4(a),ASU4(b)));}
-//------------------------------------------------------------------------------------------------------------------------------
-  // No packed version of min3.
-  AH1 AMin3H1(AH1 x,AH1 y,AH1 z){return min(x,min(y,z));}
-  AH2 AMin3H2(AH2 x,AH2 y,AH2 z){return min(x,min(y,z));}
-  AH3 AMin3H3(AH3 x,AH3 y,AH3 z){return min(x,min(y,z));}
-  AH4 AMin3H4(AH4 x,AH4 y,AH4 z){return min(x,min(y,z));}
-//------------------------------------------------------------------------------------------------------------------------------
-  AW1 AMinSW1(AW1 a,AW1 b){return AW1(min(ASU1(a),ASU1(b)));}
-  AW2 AMinSW2(AW2 a,AW2 b){return AW2(min(ASU2(a),ASU2(b)));}
-  AW3 AMinSW3(AW3 a,AW3 b){return AW3(min(ASU3(a),ASU3(b)));}
-  AW4 AMinSW4(AW4 a,AW4 b){return AW4(min(ASU4(a),ASU4(b)));}
-//------------------------------------------------------------------------------------------------------------------------------
-  AH1 ARcpH1(AH1 x){return AH1_(1.0)/x;}
-  AH2 ARcpH2(AH2 x){return AH2_(1.0)/x;}
-  AH3 ARcpH3(AH3 x){return AH3_(1.0)/x;}
-  AH4 ARcpH4(AH4 x){return AH4_(1.0)/x;}
-//------------------------------------------------------------------------------------------------------------------------------
-  AH1 ARsqH1(AH1 x){return AH1_(1.0)/sqrt(x);}
-  AH2 ARsqH2(AH2 x){return AH2_(1.0)/sqrt(x);}
-  AH3 ARsqH3(AH3 x){return AH3_(1.0)/sqrt(x);}
-  AH4 ARsqH4(AH4 x){return AH4_(1.0)/sqrt(x);}
-//------------------------------------------------------------------------------------------------------------------------------
-  AH1 ASatH1(AH1 x){return clamp(x,AH1_(0.0),AH1_(1.0));}
-  AH2 ASatH2(AH2 x){return clamp(x,AH2_(0.0),AH2_(1.0));}
-  AH3 ASatH3(AH3 x){return clamp(x,AH3_(0.0),AH3_(1.0));}
-  AH4 ASatH4(AH4 x){return clamp(x,AH4_(0.0),AH4_(1.0));}
-//------------------------------------------------------------------------------------------------------------------------------
-  AW1 AShrSW1(AW1 a,AW1 b){return AW1(ASW1(a)>>ASW1(b));}
-  AW2 AShrSW2(AW2 a,AW2 b){return AW2(ASW2(a)>>ASW2(b));}
-  AW3 AShrSW3(AW3 a,AW3 b){return AW3(ASW3(a)>>ASW3(b));}
-  AW4 AShrSW4(AW4 a,AW4 b){return AW4(ASW4(a)>>ASW4(b));}
- #endif
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-//_____________________________________________________________/\_______________________________________________________________
-//==============================================================================================================================
 //                                                         GLSL DOUBLE
 //==============================================================================================================================
  #ifdef A_DUBL
@@ -975,13 +835,6 @@ uniform uvec4 cas_param_1;
   AU2 AWaveXorU2(AU2 v,AU1 x){return subgroupShuffleXor(v,x);}
   AU3 AWaveXorU3(AU3 v,AU1 x){return subgroupShuffleXor(v,x);}
   AU4 AWaveXorU4(AU4 v,AU1 x){return subgroupShuffleXor(v,x);}
-//------------------------------------------------------------------------------------------------------------------------------
-  #ifdef A_HALF
-   AH2 AWaveXorH2(AH2 v,AU1 x){return AH2_AU1(subgroupShuffleXor(AU1_AH2(v),x));}
-   AH4 AWaveXorH4(AH4 v,AU1 x){return AH4_AU2(subgroupShuffleXor(AU2_AH4(v),x));}
-   AW2 AWaveXorW2(AW2 v,AU1 x){return AW2_AU1(subgroupShuffleXor(AU1_AW2(v),x));}
-   AW4 AWaveXorW4(AW4 v,AU1 x){return AW4_AU2(subgroupShuffleXor(AU2_AW4(v),x));}
-  #endif
  #endif
 //==============================================================================================================================
 #endif
@@ -1056,7 +909,7 @@ uniform uvec4 cas_param_1;
  #define AU1_AH1_AF1(a) AU1_AH1_AF1_x(AF1(a))
 //------------------------------------------------------------------------------------------------------------------------------
  AU1 AU1_AH2_AF2_x(AF2 a){return f32tof16(a.x)|(f32tof16(a.y)<<16);}
- #define AU1_AH2_AF2(a) AU1_AH2_AF2_x(AF2(a)) 
+ #define AU1_AH2_AF2(a) AU1_AH2_AF2_x(AF2(a))
  #define AU1_AB4Unorm_AF4(x) D3DCOLORtoUBYTE4(AF4(x))
 //------------------------------------------------------------------------------------------------------------------------------
  AF2 AF2_AH2_AU1_x(AU1 x){return AF2(f16tof32(x&0xFFFF),f16tof32(x>>16));}
@@ -1190,168 +1043,6 @@ uniform uvec4 cas_param_1;
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 //_____________________________________________________________/\_______________________________________________________________
 //==============================================================================================================================
-//                                                          HLSL HALF
-//==============================================================================================================================
- #ifdef A_HALF
-  #ifdef A_HLSL_6_2
-   #define AH1 float16_t
-   #define AH2 float16_t2
-   #define AH3 float16_t3
-   #define AH4 float16_t4
-//------------------------------------------------------------------------------------------------------------------------------
-   #define AW1 uint16_t
-   #define AW2 uint16_t2
-   #define AW3 uint16_t3
-   #define AW4 uint16_t4
-//------------------------------------------------------------------------------------------------------------------------------
-   #define ASW1 int16_t
-   #define ASW2 int16_t2
-   #define ASW3 int16_t3
-   #define ASW4 int16_t4
-  #else
-   #define AH1 min16float
-   #define AH2 min16float2
-   #define AH3 min16float3
-   #define AH4 min16float4
-//------------------------------------------------------------------------------------------------------------------------------
-   #define AW1 min16uint
-   #define AW2 min16uint2
-   #define AW3 min16uint3
-   #define AW4 min16uint4
-//------------------------------------------------------------------------------------------------------------------------------
-   #define ASW1 min16int
-   #define ASW2 min16int2
-   #define ASW3 min16int3
-   #define ASW4 min16int4
-  #endif
-//==============================================================================================================================
-  // Need to use manual unpack to get optimal execution (don't use packed types in buffers directly).
-  // Unpack requires this pattern: https://gpuopen.com/first-steps-implementing-fp16/
-  AH2 AH2_AU1_x(AU1 x){AF2 t=f16tof32(AU2(x&0xFFFF,x>>16));return AH2(t);}
-  AH4 AH4_AU2_x(AU2 x){return AH4(AH2_AU1_x(x.x),AH2_AU1_x(x.y));}
-  AW2 AW2_AU1_x(AU1 x){AU2 t=AU2(x&0xFFFF,x>>16);return AW2(t);}
-  AW4 AW4_AU2_x(AU2 x){return AW4(AW2_AU1_x(x.x),AW2_AU1_x(x.y));}
-  #define AH2_AU1(x) AH2_AU1_x(AU1(x))
-  #define AH4_AU2(x) AH4_AU2_x(AU2(x))
-  #define AW2_AU1(x) AW2_AU1_x(AU1(x))
-  #define AW4_AU2(x) AW4_AU2_x(AU2(x))
-//------------------------------------------------------------------------------------------------------------------------------
-  AU1 AU1_AH2_x(AH2 x){return f32tof16(x.x)+(f32tof16(x.y)<<16);}
-  AU2 AU2_AH4_x(AH4 x){return AU2(AU1_AH2_x(x.xy),AU1_AH2_x(x.zw));}
-  AU1 AU1_AW2_x(AW2 x){return AU1(x.x)+(AU1(x.y)<<16);}
-  AU2 AU2_AW4_x(AW4 x){return AU2(AU1_AW2_x(x.xy),AU1_AW2_x(x.zw));}
-  #define AU1_AH2(x) AU1_AH2_x(AH2(x))
-  #define AU2_AH4(x) AU2_AH4_x(AH4(x))
-  #define AU1_AW2(x) AU1_AW2_x(AW2(x))
-  #define AU2_AW4(x) AU2_AW4_x(AW4(x))
-//==============================================================================================================================
-  #if defined(A_HLSL_6_2) && !defined(A_NO_16_BIT_CAST)
-   #define AW1_AH1(x) asuint16(x)
-   #define AW2_AH2(x) asuint16(x)
-   #define AW3_AH3(x) asuint16(x)
-   #define AW4_AH4(x) asuint16(x)
-  #else
-   #define AW1_AH1(a) AW1(f32tof16(AF1(a)))
-   #define AW2_AH2(a) AW2(AW1_AH1((a).x),AW1_AH1((a).y))
-   #define AW3_AH3(a) AW3(AW1_AH1((a).x),AW1_AH1((a).y),AW1_AH1((a).z))
-   #define AW4_AH4(a) AW4(AW1_AH1((a).x),AW1_AH1((a).y),AW1_AH1((a).z),AW1_AH1((a).w))
-  #endif
-//------------------------------------------------------------------------------------------------------------------------------
-  #if defined(A_HLSL_6_2) && !defined(A_NO_16_BIT_CAST)
-   #define AH1_AW1(x) asfloat16(x)
-   #define AH2_AW2(x) asfloat16(x)
-   #define AH3_AW3(x) asfloat16(x)
-   #define AH4_AW4(x) asfloat16(x)
-  #else
-   #define AH1_AW1(a) AH1(f16tof32(AU1(a)))
-   #define AH2_AW2(a) AH2(AH1_AW1((a).x),AH1_AW1((a).y))
-   #define AH3_AW3(a) AH3(AH1_AW1((a).x),AH1_AW1((a).y),AH1_AW1((a).z))
-   #define AH4_AW4(a) AH4(AH1_AW1((a).x),AH1_AW1((a).y),AH1_AW1((a).z),AH1_AW1((a).w))
-  #endif
-//==============================================================================================================================
-  AH1 AH1_x(AH1 a){return AH1(a);}
-  AH2 AH2_x(AH1 a){return AH2(a,a);}
-  AH3 AH3_x(AH1 a){return AH3(a,a,a);}
-  AH4 AH4_x(AH1 a){return AH4(a,a,a,a);}
-  #define AH1_(a) AH1_x(AH1(a))
-  #define AH2_(a) AH2_x(AH1(a))
-  #define AH3_(a) AH3_x(AH1(a))
-  #define AH4_(a) AH4_x(AH1(a))
-//------------------------------------------------------------------------------------------------------------------------------
-  AW1 AW1_x(AW1 a){return AW1(a);}
-  AW2 AW2_x(AW1 a){return AW2(a,a);}
-  AW3 AW3_x(AW1 a){return AW3(a,a,a);}
-  AW4 AW4_x(AW1 a){return AW4(a,a,a,a);}
-  #define AW1_(a) AW1_x(AW1(a))
-  #define AW2_(a) AW2_x(AW1(a))
-  #define AW3_(a) AW3_x(AW1(a))
-  #define AW4_(a) AW4_x(AW1(a))
-//==============================================================================================================================
-  AW1 AAbsSW1(AW1 a){return AW1(abs(ASW1(a)));}
-  AW2 AAbsSW2(AW2 a){return AW2(abs(ASW2(a)));}
-  AW3 AAbsSW3(AW3 a){return AW3(abs(ASW3(a)));}
-  AW4 AAbsSW4(AW4 a){return AW4(abs(ASW4(a)));}
-//------------------------------------------------------------------------------------------------------------------------------
-  AH1 AClampH1(AH1 x,AH1 n,AH1 m){return max(n,min(x,m));}
-  AH2 AClampH2(AH2 x,AH2 n,AH2 m){return max(n,min(x,m));}
-  AH3 AClampH3(AH3 x,AH3 n,AH3 m){return max(n,min(x,m));}
-  AH4 AClampH4(AH4 x,AH4 n,AH4 m){return max(n,min(x,m));}
-//------------------------------------------------------------------------------------------------------------------------------
- // V_FRACT_F16 (note DX frac() is different).
-  AH1 AFractH1(AH1 x){return x-floor(x);}
-  AH2 AFractH2(AH2 x){return x-floor(x);}
-  AH3 AFractH3(AH3 x){return x-floor(x);}
-  AH4 AFractH4(AH4 x){return x-floor(x);}
-//------------------------------------------------------------------------------------------------------------------------------
-  AH1 ALerpH1(AH1 x,AH1 y,AH1 a){return lerp(x,y,a);}
-  AH2 ALerpH2(AH2 x,AH2 y,AH2 a){return lerp(x,y,a);}
-  AH3 ALerpH3(AH3 x,AH3 y,AH3 a){return lerp(x,y,a);}
-  AH4 ALerpH4(AH4 x,AH4 y,AH4 a){return lerp(x,y,a);}
-//------------------------------------------------------------------------------------------------------------------------------
-  AH1 AMax3H1(AH1 x,AH1 y,AH1 z){return max(x,max(y,z));}
-  AH2 AMax3H2(AH2 x,AH2 y,AH2 z){return max(x,max(y,z));}
-  AH3 AMax3H3(AH3 x,AH3 y,AH3 z){return max(x,max(y,z));}
-  AH4 AMax3H4(AH4 x,AH4 y,AH4 z){return max(x,max(y,z));}
-//------------------------------------------------------------------------------------------------------------------------------
-  AW1 AMaxSW1(AW1 a,AW1 b){return AW1(max(ASU1(a),ASU1(b)));}
-  AW2 AMaxSW2(AW2 a,AW2 b){return AW2(max(ASU2(a),ASU2(b)));}
-  AW3 AMaxSW3(AW3 a,AW3 b){return AW3(max(ASU3(a),ASU3(b)));}
-  AW4 AMaxSW4(AW4 a,AW4 b){return AW4(max(ASU4(a),ASU4(b)));}
-//------------------------------------------------------------------------------------------------------------------------------
-  AH1 AMin3H1(AH1 x,AH1 y,AH1 z){return min(x,min(y,z));}
-  AH2 AMin3H2(AH2 x,AH2 y,AH2 z){return min(x,min(y,z));}
-  AH3 AMin3H3(AH3 x,AH3 y,AH3 z){return min(x,min(y,z));}
-  AH4 AMin3H4(AH4 x,AH4 y,AH4 z){return min(x,min(y,z));}
-//------------------------------------------------------------------------------------------------------------------------------
-  AW1 AMinSW1(AW1 a,AW1 b){return AW1(min(ASU1(a),ASU1(b)));}
-  AW2 AMinSW2(AW2 a,AW2 b){return AW2(min(ASU2(a),ASU2(b)));}
-  AW3 AMinSW3(AW3 a,AW3 b){return AW3(min(ASU3(a),ASU3(b)));}
-  AW4 AMinSW4(AW4 a,AW4 b){return AW4(min(ASU4(a),ASU4(b)));}
-//------------------------------------------------------------------------------------------------------------------------------
-  AH1 ARcpH1(AH1 x){return rcp(x);}
-  AH2 ARcpH2(AH2 x){return rcp(x);}
-  AH3 ARcpH3(AH3 x){return rcp(x);}
-  AH4 ARcpH4(AH4 x){return rcp(x);}
-//------------------------------------------------------------------------------------------------------------------------------
-  AH1 ARsqH1(AH1 x){return rsqrt(x);}
-  AH2 ARsqH2(AH2 x){return rsqrt(x);}
-  AH3 ARsqH3(AH3 x){return rsqrt(x);}
-  AH4 ARsqH4(AH4 x){return rsqrt(x);}
-//------------------------------------------------------------------------------------------------------------------------------
-  AH1 ASatH1(AH1 x){return saturate(x);}
-  AH2 ASatH2(AH2 x){return saturate(x);}
-  AH3 ASatH3(AH3 x){return saturate(x);}
-  AH4 ASatH4(AH4 x){return saturate(x);}
-//------------------------------------------------------------------------------------------------------------------------------
-  AW1 AShrSW1(AW1 a,AW1 b){return AW1(ASW1(a)>>ASW1(b));}
-  AW2 AShrSW2(AW2 a,AW2 b){return AW2(ASW2(a)>>ASW2(b));}
-  AW3 AShrSW3(AW3 a,AW3 b){return AW3(ASW3(a)>>ASW3(b));}
-  AW4 AShrSW4(AW4 a,AW4 b){return AW4(ASW4(a)>>ASW4(b));}
- #endif
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-//_____________________________________________________________/\_______________________________________________________________
-//==============================================================================================================================
 //                                                         HLSL DOUBLE
 //==============================================================================================================================
  #ifdef A_DUBL
@@ -1414,13 +1105,6 @@ uniform uvec4 cas_param_1;
   AU2 AWaveXorU1(AU2 v,AU1 x){return WaveReadLaneAt(v,WaveGetLaneIndex()^x);}
   AU3 AWaveXorU1(AU3 v,AU1 x){return WaveReadLaneAt(v,WaveGetLaneIndex()^x);}
   AU4 AWaveXorU1(AU4 v,AU1 x){return WaveReadLaneAt(v,WaveGetLaneIndex()^x);}
-//------------------------------------------------------------------------------------------------------------------------------
-  #ifdef A_HALF
-   AH2 AWaveXorH2(AH2 v,AU1 x){return AH2_AU1(WaveReadLaneAt(AU1_AH2(v),WaveGetLaneIndex()^x));}
-   AH4 AWaveXorH4(AH4 v,AU1 x){return AH4_AU2(WaveReadLaneAt(AU2_AH4(v),WaveGetLaneIndex()^x));}
-   AW2 AWaveXorW2(AW2 v,AU1 x){return AW2_AU1(WaveReadLaneAt(AU1_AW2(v),WaveGetLaneIndex()^x));}
-   AW4 AWaveXorW4(AW4 v,AU1 x){return AW4_AU1(WaveReadLaneAt(AU1_AW4(v),WaveGetLaneIndex()^x));}
-  #endif
  #endif
 //==============================================================================================================================
 #endif
@@ -1462,36 +1146,10 @@ uniform uvec4 cas_param_1;
  AF3 ASignedF3(AF3 m){return ASatF3(m*AF3_(A_INFN_F));}
  AF4 ASignedF4(AF4 m){return ASatF4(m*AF4_(A_INFN_F));}
 //------------------------------------------------------------------------------------------------------------------------------
- AF1 AGtZeroF1(AF1 m){return ASatF1(m*AF1_(A_INFP_F));}
+// #2744: scalar AGtZeroF1 disabled to avoid constant overflow. Original: AF1 AGtZeroF1(AF1 m){return ASatF1(m*AF1_(A_INFP_F));}
  AF2 AGtZeroF2(AF2 m){return ASatF2(m*AF2_(A_INFP_F));}
  AF3 AGtZeroF3(AF3 m){return ASatF3(m*AF3_(A_INFP_F));}
  AF4 AGtZeroF4(AF4 m){return ASatF4(m*AF4_(A_INFP_F));}
-//==============================================================================================================================
- #ifdef A_HALF
-  #ifdef A_HLSL_6_2
-   #define A_INFP_H AH1_AW1((uint16_t)0x7c00u)
-   #define A_INFN_H AH1_AW1((uint16_t)0xfc00u)
-  #else
-   #define A_INFP_H AH1_AW1(0x7c00u)
-   #define A_INFN_H AH1_AW1(0xfc00u)
-  #endif
-
-//------------------------------------------------------------------------------------------------------------------------------
-  AH1 ACpySgnH1(AH1 d,AH1 s){return AH1_AW1(AW1_AH1(d)|(AW1_AH1(s)&AW1_(0x8000u)));}
-  AH2 ACpySgnH2(AH2 d,AH2 s){return AH2_AW2(AW2_AH2(d)|(AW2_AH2(s)&AW2_(0x8000u)));}
-  AH3 ACpySgnH3(AH3 d,AH3 s){return AH3_AW3(AW3_AH3(d)|(AW3_AH3(s)&AW3_(0x8000u)));}
-  AH4 ACpySgnH4(AH4 d,AH4 s){return AH4_AW4(AW4_AH4(d)|(AW4_AH4(s)&AW4_(0x8000u)));}
-//------------------------------------------------------------------------------------------------------------------------------
-  AH1 ASignedH1(AH1 m){return ASatH1(m*AH1_(A_INFN_H));}
-  AH2 ASignedH2(AH2 m){return ASatH2(m*AH2_(A_INFN_H));}
-  AH3 ASignedH3(AH3 m){return ASatH3(m*AH3_(A_INFN_H));}
-  AH4 ASignedH4(AH4 m){return ASatH4(m*AH4_(A_INFN_H));}
-//------------------------------------------------------------------------------------------------------------------------------
-  AH1 AGtZeroH1(AH1 m){return ASatH1(m*AH1_(A_INFP_H));}
-  AH2 AGtZeroH2(AH2 m){return ASatH2(m*AH2_(A_INFP_H));}
-  AH3 AGtZeroH3(AH3 m){return ASatH3(m*AH3_(A_INFP_H));}
-  AH4 AGtZeroH4(AH4 m){return ASatH4(m*AH4_(A_INFP_H));}
- #endif
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 //_____________________________________________________________/\_______________________________________________________________
@@ -1515,49 +1173,6 @@ uniform uvec4 cas_param_1;
  // Just adjust high 16-bit value (useful when upper part of 32-bit word is a 16-bit float value).
  AU1 AFisToHiU1(AU1 x){return x^(( AShrSU1(x,AU1_(15)))|AU1_(0x80000000));}
  AU1 AFisFromHiU1(AU1 x){return x^((~AShrSU1(x,AU1_(15)))|AU1_(0x80000000));}
-//------------------------------------------------------------------------------------------------------------------------------
- #ifdef A_HALF
-  AW1 AFisToW1(AW1 x){return x^(( AShrSW1(x,AW1_(15)))|AW1_(0x8000));}
-  AW1 AFisFromW1(AW1 x){return x^((~AShrSW1(x,AW1_(15)))|AW1_(0x8000));}
-//------------------------------------------------------------------------------------------------------------------------------
-  AW2 AFisToW2(AW2 x){return x^(( AShrSW2(x,AW2_(15)))|AW2_(0x8000));}
-  AW2 AFisFromW2(AW2 x){return x^((~AShrSW2(x,AW2_(15)))|AW2_(0x8000));}
- #endif
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-//_____________________________________________________________/\_______________________________________________________________
-//==============================================================================================================================
-//                                                      [PERM] V_PERM_B32
-//------------------------------------------------------------------------------------------------------------------------------
-// Support for V_PERM_B32 started in the 3rd generation of GCN.
-//------------------------------------------------------------------------------------------------------------------------------
-// yyyyxxxx - The 'i' input.
-// 76543210
-// ========
-// HGFEDCBA - Naming on permutation.
-//------------------------------------------------------------------------------------------------------------------------------
-// TODO
-// ====
-//  - Make sure compiler optimizes this.
-//==============================================================================================================================
- #ifdef A_HALF
-  AU1 APerm0E0A(AU2 i){return((i.x    )&0xffu)|((i.y<<16)&0xff0000u);}
-  AU1 APerm0F0B(AU2 i){return((i.x>> 8)&0xffu)|((i.y<< 8)&0xff0000u);}
-  AU1 APerm0G0C(AU2 i){return((i.x>>16)&0xffu)|((i.y    )&0xff0000u);}
-  AU1 APerm0H0D(AU2 i){return((i.x>>24)&0xffu)|((i.y>> 8)&0xff0000u);}
-//------------------------------------------------------------------------------------------------------------------------------
-  AU1 APermHGFA(AU2 i){return((i.x    )&0x000000ffu)|(i.y&0xffffff00u);}
-  AU1 APermHGFC(AU2 i){return((i.x>>16)&0x000000ffu)|(i.y&0xffffff00u);}
-  AU1 APermHGAE(AU2 i){return((i.x<< 8)&0x0000ff00u)|(i.y&0xffff00ffu);}
-  AU1 APermHGCE(AU2 i){return((i.x>> 8)&0x0000ff00u)|(i.y&0xffff00ffu);}
-  AU1 APermHAFE(AU2 i){return((i.x<<16)&0x00ff0000u)|(i.y&0xff00ffffu);}
-  AU1 APermHCFE(AU2 i){return((i.x    )&0x00ff0000u)|(i.y&0xff00ffffu);}
-  AU1 APermAGFE(AU2 i){return((i.x<<24)&0xff000000u)|(i.y&0x00ffffffu);}
-  AU1 APermCGFE(AU2 i){return((i.x<< 8)&0xff000000u)|(i.y&0x00ffffffu);}
-//------------------------------------------------------------------------------------------------------------------------------
-  AU1 APermGCEA(AU2 i){return((i.x)&0x00ff00ffu)|((i.y<<8)&0xff00ff00u);}
-  AU1 APermGECA(AU2 i){return(((i.x)&0xffu)|((i.x>>8)&0xff00u)|((i.y<<16)&0xff0000u)|((i.y<<8)&0xff000000u));}
- #endif
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 //_____________________________________________________________/\_______________________________________________________________
@@ -1573,7 +1188,7 @@ uniform uvec4 cas_param_1;
 //  - V_CVT_F32_UBYTE{0,1,2,3} - Unsigned byte to float.
 //  - V_CVT_PKACC_U8_F32 - Float to unsigned byte (does bit-field insert into 32-bit integer).
 // V_PERM_B32 does byte packing with ability to zero fill bytes as well.
-//  - Can pull out byte values from two sources, and zero fill upper 8-bits of packed hi and lo. 
+//  - Can pull out byte values from two sources, and zero fill upper 8-bits of packed hi and lo.
 //------------------------------------------------------------------------------------------------------------------------------
 // BYTE : FLOAT - ABuc{0,1,2,3}{To,From}U1() - Designed for V_CVT_F32_UBYTE* and V_CVT_PKACCUM_U8_F32 ops.
 // ====   =====
@@ -1634,28 +1249,6 @@ uniform uvec4 cas_param_1;
   AF1 ABuc2FromU1(AU1 i){return AF1((i>>16)&255u);}
   AF1 ABuc3FromU1(AU1 i){return AF1((i>>24)&255u);}
  #endif
-//==============================================================================================================================
- #ifdef A_HALF
-  // Takes {x0,x1} and {y0,y1} and builds {{x0,y0},{x1,y1}}.
-  AW2 ABuc01ToW2(AH2 x,AH2 y){x*=AH2_(1.0/32768.0);y*=AH2_(1.0/32768.0);
-   return AW2_AU1(APermGCEA(AU2(AU1_AW2(AW2_AH2(x)),AU1_AW2(AW2_AH2(y)))));}
-//------------------------------------------------------------------------------------------------------------------------------
-  // Designed for 3 ops to do SOA to AOS and conversion.
-  AU2 ABuc0ToU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)));
-   return AU2(APermHGFA(AU2(d.x,b)),APermHGFC(AU2(d.y,b)));}
-  AU2 ABuc1ToU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)));
-   return AU2(APermHGAE(AU2(d.x,b)),APermHGCE(AU2(d.y,b)));}
-  AU2 ABuc2ToU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)));
-   return AU2(APermHAFE(AU2(d.x,b)),APermHCFE(AU2(d.y,b)));}
-  AU2 ABuc3ToU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)));
-   return AU2(APermAGFE(AU2(d.x,b)),APermCGFE(AU2(d.y,b)));}
-//------------------------------------------------------------------------------------------------------------------------------
-  // Designed for 2 ops to do both AOS to SOA, and conversion.
-  AH2 ABuc0FromU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0E0A(i)))*AH2_(32768.0);}
-  AH2 ABuc1FromU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0F0B(i)))*AH2_(32768.0);}
-  AH2 ABuc2FromU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0G0C(i)))*AH2_(32768.0);}
-  AH2 ABuc3FromU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0H0D(i)))*AH2_(32768.0);}
- #endif
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 //_____________________________________________________________/\_______________________________________________________________
@@ -1668,8 +1261,8 @@ uniform uvec4 cas_param_1;
 // ENCODING (without zero-based encoding)
 // ========
 //   0 = unused (can be used to mean something else)
-//   1 = lowest value 
-// 128 = exact zero center (zero based encoding 
+//   1 = lowest value
+// 128 = exact zero center (zero based encoding)
 // 255 = highest value
 //------------------------------------------------------------------------------------------------------------------------------
 // Zero-based [Zb] flips the MSB bit of the byte (making 128 "exact zero" actually zero).
@@ -1681,8 +1274,8 @@ uniform uvec4 cas_param_1;
 //    1 : -126/512
 //    2 : -125/512
 //     ...
-//  128 : 0 
-//     ... 
+//  128 : 0
+//     ...
 //  255 : 127/512
 //      : 1/4 (just outside the encoding range)
 //==============================================================================================================================
@@ -1711,83 +1304,6 @@ uniform uvec4 cas_param_1;
   AF1 ABsc2FromZbU1(AU1 i){return AF1(((i>>16)&255u)^0x80u)-128.0;}
   AF1 ABsc3FromZbU1(AU1 i){return AF1(((i>>24)&255u)^0x80u)-128.0;}
  #endif
-//==============================================================================================================================
- #ifdef A_HALF
-  // Takes {x0,x1} and {y0,y1} and builds {{x0,y0},{x1,y1}}.
-  AW2 ABsc01ToW2(AH2 x,AH2 y){x=x*AH2_(1.0/32768.0)+AH2_(0.25/32768.0);y=y*AH2_(1.0/32768.0)+AH2_(0.25/32768.0);
-   return AW2_AU1(APermGCEA(AU2(AU1_AW2(AW2_AH2(x)),AU1_AW2(AW2_AH2(y)))));}
-//------------------------------------------------------------------------------------------------------------------------------
-  AU2 ABsc0ToU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)+AH2_(0.25/32768.0)));
-   return AU2(APermHGFA(AU2(d.x,b)),APermHGFC(AU2(d.y,b)));}
-  AU2 ABsc1ToU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)+AH2_(0.25/32768.0)));
-   return AU2(APermHGAE(AU2(d.x,b)),APermHGCE(AU2(d.y,b)));}
-  AU2 ABsc2ToU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)+AH2_(0.25/32768.0)));
-   return AU2(APermHAFE(AU2(d.x,b)),APermHCFE(AU2(d.y,b)));}
-  AU2 ABsc3ToU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)+AH2_(0.25/32768.0)));
-   return AU2(APermAGFE(AU2(d.x,b)),APermCGFE(AU2(d.y,b)));}
-//------------------------------------------------------------------------------------------------------------------------------
-  AU2 ABsc0ToZbU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)+AH2_(0.25/32768.0)))^0x00800080u;
-   return AU2(APermHGFA(AU2(d.x,b)),APermHGFC(AU2(d.y,b)));}
-  AU2 ABsc1ToZbU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)+AH2_(0.25/32768.0)))^0x00800080u;
-   return AU2(APermHGAE(AU2(d.x,b)),APermHGCE(AU2(d.y,b)));}
-  AU2 ABsc2ToZbU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)+AH2_(0.25/32768.0)))^0x00800080u;
-   return AU2(APermHAFE(AU2(d.x,b)),APermHCFE(AU2(d.y,b)));}
-  AU2 ABsc3ToZbU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)+AH2_(0.25/32768.0)))^0x00800080u;
-   return AU2(APermAGFE(AU2(d.x,b)),APermCGFE(AU2(d.y,b)));}
-//------------------------------------------------------------------------------------------------------------------------------
-  AH2 ABsc0FromU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0E0A(i)))*AH2_(32768.0)-AH2_(0.25);}
-  AH2 ABsc1FromU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0F0B(i)))*AH2_(32768.0)-AH2_(0.25);}
-  AH2 ABsc2FromU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0G0C(i)))*AH2_(32768.0)-AH2_(0.25);}
-  AH2 ABsc3FromU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0H0D(i)))*AH2_(32768.0)-AH2_(0.25);}
-//------------------------------------------------------------------------------------------------------------------------------
-  AH2 ABsc0FromZbU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0E0A(i)^0x00800080u))*AH2_(32768.0)-AH2_(0.25);}
-  AH2 ABsc1FromZbU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0F0B(i)^0x00800080u))*AH2_(32768.0)-AH2_(0.25);}
-  AH2 ABsc2FromZbU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0G0C(i)^0x00800080u))*AH2_(32768.0)-AH2_(0.25);}
-  AH2 ABsc3FromZbU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0H0D(i)^0x00800080u))*AH2_(32768.0)-AH2_(0.25);}
- #endif
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-//_____________________________________________________________/\_______________________________________________________________
-//==============================================================================================================================
-//                                                     HALF APPROXIMATIONS
-//------------------------------------------------------------------------------------------------------------------------------
-// These support only positive inputs.
-// Did not see value yet in specialization for range.
-// Using quick testing, ended up mostly getting the same "best" approximation for various ranges.
-// With hardware that can co-execute transcendentals, the value in approximations could be less than expected.
-// However from a latency perspective, if execution of a transcendental is 4 clk, with no packed support, -> 8 clk total.
-// And co-execution would require a compiler interleaving a lot of independent work for packed usage.
-//------------------------------------------------------------------------------------------------------------------------------
-// The one Newton Raphson iteration form of rsq() was skipped (requires 6 ops total).
-// Same with sqrt(), as this could be x*rsq() (7 ops).
-//==============================================================================================================================
- #ifdef A_HALF
-  // Minimize squared error across full positive range, 2 ops.
-  // The 0x1de2 based approximation maps {0 to 1} input maps to < 1 output.
-  AH1 APrxLoSqrtH1(AH1 a){return AH1_AW1((AW1_AH1(a)>>AW1_(1))+AW1_(0x1de2));}
-  AH2 APrxLoSqrtH2(AH2 a){return AH2_AW2((AW2_AH2(a)>>AW2_(1))+AW2_(0x1de2));}
-  AH3 APrxLoSqrtH3(AH3 a){return AH3_AW3((AW3_AH3(a)>>AW3_(1))+AW3_(0x1de2));}
-  AH4 APrxLoSqrtH4(AH4 a){return AH4_AW4((AW4_AH4(a)>>AW4_(1))+AW4_(0x1de2));}
-//------------------------------------------------------------------------------------------------------------------------------
-  // Lower precision estimation, 1 op.
-  // Minimize squared error across {smallest normal to 16384.0}.
-  AH1 APrxLoRcpH1(AH1 a){return AH1_AW1(AW1_(0x7784)-AW1_AH1(a));}
-  AH2 APrxLoRcpH2(AH2 a){return AH2_AW2(AW2_(0x7784)-AW2_AH2(a));}
-  AH3 APrxLoRcpH3(AH3 a){return AH3_AW3(AW3_(0x7784)-AW3_AH3(a));}
-  AH4 APrxLoRcpH4(AH4 a){return AH4_AW4(AW4_(0x7784)-AW4_AH4(a));}
-//------------------------------------------------------------------------------------------------------------------------------
-  // Medium precision estimation, one Newton Raphson iteration, 3 ops.
-  AH1 APrxMedRcpH1(AH1 a){AH1 b=AH1_AW1(AW1_(0x778d)-AW1_AH1(a));return b*(-b*a+AH1_(2.0));}
-  AH2 APrxMedRcpH2(AH2 a){AH2 b=AH2_AW2(AW2_(0x778d)-AW2_AH2(a));return b*(-b*a+AH2_(2.0));}
-  AH3 APrxMedRcpH3(AH3 a){AH3 b=AH3_AW3(AW3_(0x778d)-AW3_AH3(a));return b*(-b*a+AH3_(2.0));}
-  AH4 APrxMedRcpH4(AH4 a){AH4 b=AH4_AW4(AW4_(0x778d)-AW4_AH4(a));return b*(-b*a+AH4_(2.0));}
-//------------------------------------------------------------------------------------------------------------------------------
-  // Minimize squared error across {smallest normal to 16384.0}, 2 ops.
-  AH1 APrxLoRsqH1(AH1 a){return AH1_AW1(AW1_(0x59a3)-(AW1_AH1(a)>>AW1_(1)));}
-  AH2 APrxLoRsqH2(AH2 a){return AH2_AW2(AW2_(0x59a3)-(AW2_AH2(a)>>AW2_(1)));}
-  AH3 APrxLoRsqH3(AH3 a){return AH3_AW3(AW3_(0x59a3)-(AW3_AH3(a)>>AW3_(1)));}
-  AH4 APrxLoRsqH4(AH4 a){return AH4_AW4(AW4_(0x59a3)-(AW4_AH4(a)>>AW4_(1)));}
- #endif
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 //_____________________________________________________________/\_______________________________________________________________
@@ -1896,17 +1412,6 @@ uniform uvec4 cas_param_1;
   AF2 APCosF2(AF2 x){x=AFractF2(x*AF2_(0.5)+AF2_(0.75));x=x*AF2_(2.0)-AF2_(1.0);return APSinF2(x);}
   AF2 APSinCosF1(AF1 x){AF1 y=AFractF1(x*AF1_(0.5)+AF1_(0.75));y=y*AF1_(2.0)-AF1_(1.0);return APSinF2(AF2(x,y));}
  #endif
-//------------------------------------------------------------------------------------------------------------------------------
- #ifdef A_HALF
-  // For a packed {sin,cos} pair,
-  //  - Native takes 16 clocks and 4 issue slots (no packed transcendentals).
-  //  - Parabolic takes 8 clocks and 8 issue slots (only fract is non-packed).
-  AH1 APSinH1(AH1 x){return x*abs(x)-x;}
-  AH2 APSinH2(AH2 x){return x*abs(x)-x;} // AND,FMA
-  AH1 APCosH1(AH1 x){x=AFractH1(x*AH1_(0.5)+AH1_(0.75));x=x*AH1_(2.0)-AH1_(1.0);return APSinH1(x);} 
-  AH2 APCosH2(AH2 x){x=AFractH2(x*AH2_(0.5)+AH2_(0.75));x=x*AH2_(2.0)-AH2_(1.0);return APSinH2(x);} // 3x FMA, 2xFRACT, AND
-  AH2 APSinCosH1(AH1 x){AH1 y=AFractH1(x*AH1_(0.5)+AH1_(0.75));y=y*AH1_(2.0)-AH1_(1.0);return APSinH2(AH2(x,y));}
- #endif
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 //_____________________________________________________________/\_______________________________________________________________
@@ -2007,75 +1512,6 @@ uniform uvec4 cas_param_1;
   AF3 AZolZeroPassF3(AF3 x,AF3 y){return AF3_AU3((AU3_AF3(x)!=AU3_(0))?AU3_(0):AU3_AF3(y));}
   AF4 AZolZeroPassF4(AF4 x,AF4 y){return AF4_AU4((AU4_AF4(x)!=AU4_(0))?AU4_(0):AU4_AF4(y));}
  #endif
-//==============================================================================================================================
- #ifdef A_HALF
-  AW1 AZolAndW1(AW1 x,AW1 y){return min(x,y);}
-  AW2 AZolAndW2(AW2 x,AW2 y){return min(x,y);}
-  AW3 AZolAndW3(AW3 x,AW3 y){return min(x,y);}
-  AW4 AZolAndW4(AW4 x,AW4 y){return min(x,y);}
-//------------------------------------------------------------------------------------------------------------------------------
-  AW1 AZolNotW1(AW1 x){return x^AW1_(1);}
-  AW2 AZolNotW2(AW2 x){return x^AW2_(1);}
-  AW3 AZolNotW3(AW3 x){return x^AW3_(1);}
-  AW4 AZolNotW4(AW4 x){return x^AW4_(1);}
-//------------------------------------------------------------------------------------------------------------------------------
-  AW1 AZolOrW1(AW1 x,AW1 y){return max(x,y);}
-  AW2 AZolOrW2(AW2 x,AW2 y){return max(x,y);}
-  AW3 AZolOrW3(AW3 x,AW3 y){return max(x,y);}
-  AW4 AZolOrW4(AW4 x,AW4 y){return max(x,y);}
-//==============================================================================================================================
-  // Uses denormal trick.
-  AW1 AZolH1ToW1(AH1 x){return AW1_AH1(x*AH1_AW1(AW1_(1)));}
-  AW2 AZolH2ToW2(AH2 x){return AW2_AH2(x*AH2_AW2(AW2_(1)));}
-  AW3 AZolH3ToW3(AH3 x){return AW3_AH3(x*AH3_AW3(AW3_(1)));}
-  AW4 AZolH4ToW4(AH4 x){return AW4_AH4(x*AH4_AW4(AW4_(1)));}
-//------------------------------------------------------------------------------------------------------------------------------
-  // AMD arch lacks a packed conversion opcode.
-  AH1 AZolW1ToH1(AW1 x){return AH1_AW1(x*AW1_AH1(AH1_(1.0)));}
-  AH2 AZolW2ToH2(AW2 x){return AH2_AW2(x*AW2_AH2(AH2_(1.0)));}
-  AH3 AZolW1ToH3(AW3 x){return AH3_AW3(x*AW3_AH3(AH3_(1.0)));}
-  AH4 AZolW2ToH4(AW4 x){return AH4_AW4(x*AW4_AH4(AH4_(1.0)));}
-//==============================================================================================================================
-  AH1 AZolAndH1(AH1 x,AH1 y){return min(x,y);}
-  AH2 AZolAndH2(AH2 x,AH2 y){return min(x,y);}
-  AH3 AZolAndH3(AH3 x,AH3 y){return min(x,y);}
-  AH4 AZolAndH4(AH4 x,AH4 y){return min(x,y);}
-//------------------------------------------------------------------------------------------------------------------------------
-  AH1 ASolAndNotH1(AH1 x,AH1 y){return (-x)*y+AH1_(1.0);}
-  AH2 ASolAndNotH2(AH2 x,AH2 y){return (-x)*y+AH2_(1.0);}
-  AH3 ASolAndNotH3(AH3 x,AH3 y){return (-x)*y+AH3_(1.0);}
-  AH4 ASolAndNotH4(AH4 x,AH4 y){return (-x)*y+AH4_(1.0);}
-//------------------------------------------------------------------------------------------------------------------------------
-  AH1 AZolAndOrH1(AH1 x,AH1 y,AH1 z){return ASatH1(x*y+z);}
-  AH2 AZolAndOrH2(AH2 x,AH2 y,AH2 z){return ASatH2(x*y+z);}
-  AH3 AZolAndOrH3(AH3 x,AH3 y,AH3 z){return ASatH3(x*y+z);}
-  AH4 AZolAndOrH4(AH4 x,AH4 y,AH4 z){return ASatH4(x*y+z);}
-//------------------------------------------------------------------------------------------------------------------------------
-  AH1 AZolGtZeroH1(AH1 x){return ASatH1(x*AH1_(A_INFP_H));}
-  AH2 AZolGtZeroH2(AH2 x){return ASatH2(x*AH2_(A_INFP_H));}
-  AH3 AZolGtZeroH3(AH3 x){return ASatH3(x*AH3_(A_INFP_H));}
-  AH4 AZolGtZeroH4(AH4 x){return ASatH4(x*AH4_(A_INFP_H));}
-//------------------------------------------------------------------------------------------------------------------------------
-  AH1 AZolNotH1(AH1 x){return AH1_(1.0)-x;}
-  AH2 AZolNotH2(AH2 x){return AH2_(1.0)-x;}
-  AH3 AZolNotH3(AH3 x){return AH3_(1.0)-x;}
-  AH4 AZolNotH4(AH4 x){return AH4_(1.0)-x;}
-//------------------------------------------------------------------------------------------------------------------------------
-  AH1 AZolOrH1(AH1 x,AH1 y){return max(x,y);}
-  AH2 AZolOrH2(AH2 x,AH2 y){return max(x,y);}
-  AH3 AZolOrH3(AH3 x,AH3 y){return max(x,y);}
-  AH4 AZolOrH4(AH4 x,AH4 y){return max(x,y);}
-//------------------------------------------------------------------------------------------------------------------------------
-  AH1 AZolSelH1(AH1 x,AH1 y,AH1 z){AH1 r=(-x)*z+z;return x*y+r;}
-  AH2 AZolSelH2(AH2 x,AH2 y,AH2 z){AH2 r=(-x)*z+z;return x*y+r;}
-  AH3 AZolSelH3(AH3 x,AH3 y,AH3 z){AH3 r=(-x)*z+z;return x*y+r;}
-  AH4 AZolSelH4(AH4 x,AH4 y,AH4 z){AH4 r=(-x)*z+z;return x*y+r;}
-//------------------------------------------------------------------------------------------------------------------------------
-  AH1 AZolSignedH1(AH1 x){return ASatH1(x*AH1_(A_INFN_H));}
-  AH2 AZolSignedH2(AH2 x){return ASatH2(x*AH2_(A_INFN_H));}
-  AH3 AZolSignedH3(AH3 x){return ASatH3(x*AH3_(A_INFN_H));}
-  AH4 AZolSignedH4(AH4 x){return ASatH4(x*AH4_(A_INFN_H));}
- #endif
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 //_____________________________________________________________/\_______________________________________________________________
@@ -2137,9 +1573,9 @@ uniform uvec4 cas_param_1;
    return clamp(j.xxx,c*j.yyy,pow(c,j.zzz)*k.xxx+k.yyy);}
 //------------------------------------------------------------------------------------------------------------------------------
   // Note 'rcpX' is '1/x', where the 'x' is what would be used in AFromGamma().
-  AF1 AToGammaF1(AF1 c,AF1 rcpX){return pow(c,AF1_(rcpX));} 
-  AF2 AToGammaF2(AF2 c,AF1 rcpX){return pow(c,AF2_(rcpX));} 
-  AF3 AToGammaF3(AF3 c,AF1 rcpX){return pow(c,AF3_(rcpX));} 
+  AF1 AToGammaF1(AF1 c,AF1 rcpX){return pow(c,AF1_(rcpX));}
+  AF2 AToGammaF2(AF2 c,AF1 rcpX){return pow(c,AF2_(rcpX));}
+  AF3 AToGammaF3(AF3 c,AF1 rcpX){return pow(c,AF3_(rcpX));}
 //------------------------------------------------------------------------------------------------------------------------------
   AF1 AToPqF1(AF1 x){AF1 p=pow(x,AF1_(0.159302));
    return pow((AF1_(0.835938)+AF1_(18.8516)*p)/(AF1_(1.0)+AF1_(18.6875)*p),AF1_(78.8438));}
@@ -2173,9 +1609,9 @@ uniform uvec4 cas_param_1;
   AF3 AFrom709F3(AF3 c){AF3 j=AF3(0.081/4.5,1.0/4.5,1.0/0.45);AF2 k=AF2(1.0/1.099,0.099/1.099);
    return AZolSelF3(AZolSignedF3(c-j.xxx),c*j.yyy,pow(c*k.xxx+k.yyy,j.zzz));}
 //------------------------------------------------------------------------------------------------------------------------------
-  AF1 AFromGammaF1(AF1 c,AF1 x){return pow(c,AF1_(x));} 
-  AF2 AFromGammaF2(AF2 c,AF1 x){return pow(c,AF2_(x));} 
-  AF3 AFromGammaF3(AF3 c,AF1 x){return pow(c,AF3_(x));} 
+  AF1 AFromGammaF1(AF1 c,AF1 x){return pow(c,AF1_(x));}
+  AF2 AFromGammaF2(AF2 c,AF1 x){return pow(c,AF2_(x));}
+  AF3 AFromGammaF3(AF3 c,AF1 x){return pow(c,AF3_(x));}
 //------------------------------------------------------------------------------------------------------------------------------
   AF1 AFromPqF1(AF1 x){AF1 p=pow(x,AF1_(0.0126833));
    return pow(ASatF1(p-AF1_(0.835938))/(AF1_(18.8516)-AF1_(18.6875)*p),AF1_(6.27739));}
@@ -2201,61 +1637,6 @@ uniform uvec4 cas_param_1;
   AF3 AFromThreeF3(AF3 c){return c*c*c;}
  #endif
 //==============================================================================================================================
- #ifdef A_HALF
-  AH1 ATo709H1(AH1 c){AH3 j=AH3(0.018*4.5,4.5,0.45);AH2 k=AH2(1.099,-0.099);
-   return clamp(j.x  ,c*j.y  ,pow(c,j.z  )*k.x  +k.y  );}
-  AH2 ATo709H2(AH2 c){AH3 j=AH3(0.018*4.5,4.5,0.45);AH2 k=AH2(1.099,-0.099);
-   return clamp(j.xx ,c*j.yy ,pow(c,j.zz )*k.xx +k.yy );}
-  AH3 ATo709H3(AH3 c){AH3 j=AH3(0.018*4.5,4.5,0.45);AH2 k=AH2(1.099,-0.099);
-   return clamp(j.xxx,c*j.yyy,pow(c,j.zzz)*k.xxx+k.yyy);}
-//------------------------------------------------------------------------------------------------------------------------------
-  AH1 AToGammaH1(AH1 c,AH1 rcpX){return pow(c,AH1_(rcpX));}
-  AH2 AToGammaH2(AH2 c,AH1 rcpX){return pow(c,AH2_(rcpX));}
-  AH3 AToGammaH3(AH3 c,AH1 rcpX){return pow(c,AH3_(rcpX));}
-//------------------------------------------------------------------------------------------------------------------------------
-  AH1 AToSrgbH1(AH1 c){AH3 j=AH3(0.0031308*12.92,12.92,1.0/2.4);AH2 k=AH2(1.055,-0.055);
-   return clamp(j.x  ,c*j.y  ,pow(c,j.z  )*k.x  +k.y  );}
-  AH2 AToSrgbH2(AH2 c){AH3 j=AH3(0.0031308*12.92,12.92,1.0/2.4);AH2 k=AH2(1.055,-0.055);
-   return clamp(j.xx ,c*j.yy ,pow(c,j.zz )*k.xx +k.yy );}
-  AH3 AToSrgbH3(AH3 c){AH3 j=AH3(0.0031308*12.92,12.92,1.0/2.4);AH2 k=AH2(1.055,-0.055);
-   return clamp(j.xxx,c*j.yyy,pow(c,j.zzz)*k.xxx+k.yyy);}
-//------------------------------------------------------------------------------------------------------------------------------
-  AH1 AToTwoH1(AH1 c){return sqrt(c);}
-  AH2 AToTwoH2(AH2 c){return sqrt(c);}
-  AH3 AToTwoH3(AH3 c){return sqrt(c);}
-//------------------------------------------------------------------------------------------------------------------------------
-  AH1 AToThreeF1(AH1 c){return pow(c,AH1_(1.0/3.0));}
-  AH2 AToThreeF2(AH2 c){return pow(c,AH2_(1.0/3.0));}
-  AH3 AToThreeF3(AH3 c){return pow(c,AH3_(1.0/3.0));}
- #endif
-//==============================================================================================================================
- #ifdef A_HALF
-  AH1 AFrom709H1(AH1 c){AH3 j=AH3(0.081/4.5,1.0/4.5,1.0/0.45);AH2 k=AH2(1.0/1.099,0.099/1.099);
-   return AZolSelH1(AZolSignedH1(c-j.x  ),c*j.y  ,pow(c*k.x  +k.y  ,j.z  ));}
-  AH2 AFrom709H2(AH2 c){AH3 j=AH3(0.081/4.5,1.0/4.5,1.0/0.45);AH2 k=AH2(1.0/1.099,0.099/1.099);
-   return AZolSelH2(AZolSignedH2(c-j.xx ),c*j.yy ,pow(c*k.xx +k.yy ,j.zz ));}
-  AH3 AFrom709H3(AH3 c){AH3 j=AH3(0.081/4.5,1.0/4.5,1.0/0.45);AH2 k=AH2(1.0/1.099,0.099/1.099);
-   return AZolSelH3(AZolSignedH3(c-j.xxx),c*j.yyy,pow(c*k.xxx+k.yyy,j.zzz));}
-//------------------------------------------------------------------------------------------------------------------------------
-  AH1 AFromGammaH1(AH1 c,AH1 x){return pow(c,AH1_(x));}
-  AH2 AFromGammaH2(AH2 c,AH1 x){return pow(c,AH2_(x));}
-  AH3 AFromGammaH3(AH3 c,AH1 x){return pow(c,AH3_(x));}
-//------------------------------------------------------------------------------------------------------------------------------
-  AH1 AHromSrgbF1(AH1 c){AH3 j=AH3(0.04045/12.92,1.0/12.92,2.4);AH2 k=AH2(1.0/1.055,0.055/1.055);
-   return AZolSelH1(AZolSignedH1(c-j.x  ),c*j.y  ,pow(c*k.x  +k.y  ,j.z  ));}
-  AH2 AHromSrgbF2(AH2 c){AH3 j=AH3(0.04045/12.92,1.0/12.92,2.4);AH2 k=AH2(1.0/1.055,0.055/1.055);
-   return AZolSelH2(AZolSignedH2(c-j.xx ),c*j.yy ,pow(c*k.xx +k.yy ,j.zz ));}
-  AH3 AHromSrgbF3(AH3 c){AH3 j=AH3(0.04045/12.92,1.0/12.92,2.4);AH2 k=AH2(1.0/1.055,0.055/1.055);
-   return AZolSelH3(AZolSignedH3(c-j.xxx),c*j.yyy,pow(c*k.xxx+k.yyy,j.zzz));}
-//------------------------------------------------------------------------------------------------------------------------------
-  AH1 AFromTwoH1(AH1 c){return c*c;}
-  AH2 AFromTwoH2(AH2 c){return c*c;}
-  AH3 AFromTwoH3(AH3 c){return c*c;}
-//------------------------------------------------------------------------------------------------------------------------------
-  AH1 AFromThreeH1(AH1 c){return c*c*c;}
-  AH2 AFromThreeH2(AH2 c){return c*c*c;}
-  AH3 AFromThreeH3(AH3 c){return c*c*c;}
- #endif
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 //_____________________________________________________________/\_______________________________________________________________
@@ -2277,20 +1658,16 @@ uniform uvec4 cas_param_1;
  // Details,
  //  LANE TO 8x8 MAPPING
  //  ===================
- //  00 01 08 09 10 11 18 19 
+ //  00 01 08 09 10 11 18 19
  //  02 03 0a 0b 12 13 1a 1b
  //  04 05 0c 0d 14 15 1c 1d
- //  06 07 0e 0f 16 17 1e 1f 
- //  20 21 28 29 30 31 38 39 
+ //  06 07 0e 0f 16 17 1e 1f
+ //  20 21 28 29 30 31 38 39
  //  22 23 2a 2b 32 33 3a 3b
  //  24 25 2c 2d 34 35 3c 3d
- //  26 27 2e 2f 36 37 3e 3f 
+ //  26 27 2e 2f 36 37 3e 3f
  AU2 ARmpRed8x8(AU1 a){return AU2(ABfiM(ABfe(a,2u,3u),a,1u),ABfiM(ABfe(a,3u,3u),ABfe(a,1u,2u),2u));}
 //==============================================================================================================================
- #ifdef A_HALF
-  AW2 ARmp8x8H(AU1 a){return AW2(ABfe(a,1u,3u),ABfiM(ABfe(a,3u,3u),a,1u));}
-  AW2 ARmpRed8x8H(AU1 a){return AW2(ABfiM(ABfe(a,2u,3u),a,1u),ABfiM(ABfe(a,3u,3u),ABfe(a,1u,2u),2u));}
- #endif
 #endif
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -2341,7 +1718,7 @@ uniform uvec4 cas_param_1;
 //   ...
 //  1023 = 2^(-14)*(1-2^(-10)) = 2^(-14)*(1-1/1024) ... last denormal value
 //  1024 = 2^(-14) = 1/16384 .......................... first normal value that still maps to integers
-//  2047 .............................................. last normal value that still maps to integers 
+//  2047 .............................................. last normal value that still maps to integers
 // Scaling limits,
 //  2^15 = 32768 ...................................... largest power of 2 scaling
 // Largest pow2 conversion mapping is at *32768,
@@ -2653,6 +2030,7 @@ uniform uvec4 cas_param_1;
 AP1 CasSupportScaling(AF1 outX,AF1 outY,AF1 inX,AF1 inY){return ((outX*outY)*ARcpF1(inX*inY))<=CAS_AREA_LIMIT;}
 //==============================================================================================================================
 // Call to setup required constant values (works on CPU or GPU).
+#ifndef A_GPU
 A_STATIC void CasSetup(
  outAU4 const0,
  outAU4 const1,
@@ -2673,6 +2051,8 @@ A_STATIC void CasSetup(
   const1[1]=AU1_AH2_AF2(hSharp);
   const1[2]=AU1_AF1(AF1_(8.0)*inputSizeInPixelsX*ARcpF1(outputSizeInPixelsX));
   const1[3]=0;}
+#endif
+
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 //_____________________________________________________________/\_______________________________________________________________
@@ -2680,14 +2060,10 @@ A_STATIC void CasSetup(
 //                                                     NON-PACKED VERSION
 //==============================================================================================================================
 #ifdef A_GPU
- #ifdef CAS_PACKED_ONLY
-  // Avoid compiler error.
-  AF3 CasLoad(ASU2 p){return AF3(0.0,0.0,0.0);}
-  void CasInput(inout AF1 r,inout AF1 g,inout AF1 b){}
- #endif
-
  AF3 CasLoad(ASU2 p) { return texelFetch(diffuseRect, p, 0).rgb; }
- void CasInput(inout AF1 r,inout AF1 g,inout AF1 b) {}
+ void CasInput(inout AF1 r,inout AF1 g,inout AF1 b)
+ {
+ }
 
 //------------------------------------------------------------------------------------------------------------------------------
  void CasFilter(
@@ -2703,11 +2079,11 @@ A_STATIC void CasSetup(
   #ifdef CAS_DEBUG_CHECKER
    if((((ip.x^ip.y)>>8u)&1u)==0u){AF3 pix0=CasLoad(ASU2(ip));
     pixR=pix0.r;pixG=pix0.g;pixB=pix0.b;CasInput(pixR,pixG,pixB);return;}
-  #endif 
+  #endif
 //------------------------------------------------------------------------------------------------------------------------------
   // No scaling algorithm uses minimal 3x3 pixel neighborhood.
   if(noScaling){
-   // a b c 
+   // a b c
    // d e f
    // g h i
    ASU2 sp=ASU2(ip);
@@ -3101,7 +2477,7 @@ A_STATIC void CasSetup(
   //  i j k l
   //    n o
   //  _____  _____  _____  _____
-  //         fs        gt 
+  //         fs        gt
   //
   //  _____  _____  _____  _____
   //  fs      s gt  fs  t     gt
@@ -3163,571 +2539,6 @@ A_STATIC void CasSetup(
   #endif
  }
 #endif
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-//_____________________________________________________________/\_______________________________________________________________
-//==============================================================================================================================
-//                                                       PACKED VERSION
-//==============================================================================================================================
-#if defined(A_GPU) && defined(A_HALF)
- // Missing a way to do packed re-interpetation, so must disable approximation optimizations.
- #ifdef A_HLSL
-  #ifndef CAS_GO_SLOWER
-   #define CAS_GO_SLOWER 1
-  #endif
- #endif
-//==============================================================================================================================
- // Can be used to convert from packed SOA to AOS for store.
- void CasDepack(out AH4 pix0,out AH4 pix1,AH2 pixR,AH2 pixG,AH2 pixB){
-  #ifdef A_HLSL
-   // Invoke a slower path for DX only, since it won't allow uninitialized values.
-   pix0.a=pix1.a=0.0;
-  #endif
-  pix0.rgb=AH3(pixR.x,pixG.x,pixB.x);
-  pix1.rgb=AH3(pixR.y,pixG.y,pixB.y);}
-//==============================================================================================================================
- void CasFilterH(
- // Output values are for 2 8x8 tiles in a 16x8 region.
- //  pix<R,G,B>.x = right 8x8 tile
- //  pix<R,G,B>.y =  left 8x8 tile
- // This enables later processing to easily be packed as well.
- out AH2 pixR,
- out AH2 pixG,
- out AH2 pixB,
- AU2 ip, // Integer pixel position in output.
- AU4 const0, // Constants generated by CasSetup().
- AU4 const1,
- AP1 noScaling){ // Must be a compile-time literal value, true = sharpen only (no resize).
-//------------------------------------------------------------------------------------------------------------------------------
-  // Debug a checker pattern of on/off tiles for visual inspection.
-  #ifdef CAS_DEBUG_CHECKER
-   if((((ip.x^ip.y)>>8u)&1u)==0u){AH3 pix0=CasLoadH(ASW2(ip));AH3 pix1=CasLoadH(ASW2(ip)+ASW2(8,0));
-    pixR=AH2(pix0.r,pix1.r);pixG=AH2(pix0.g,pix1.g);pixB=AH2(pix0.b,pix1.b);CasInputH(pixR,pixG,pixB);return;}
-  #endif 
-//------------------------------------------------------------------------------------------------------------------------------
-  // No scaling algorithm uses minimal 3x3 pixel neighborhood.
-  if(noScaling){
-   ASW2 sp0=ASW2(ip);
-   AH3 a0=CasLoadH(sp0+ASW2(-1,-1));
-   AH3 b0=CasLoadH(sp0+ASW2( 0,-1));
-   AH3 c0=CasLoadH(sp0+ASW2( 1,-1));
-   AH3 d0=CasLoadH(sp0+ASW2(-1, 0));
-   AH3 e0=CasLoadH(sp0);
-   AH3 f0=CasLoadH(sp0+ASW2( 1, 0));
-   AH3 g0=CasLoadH(sp0+ASW2(-1, 1));
-   AH3 h0=CasLoadH(sp0+ASW2( 0, 1));
-   AH3 i0=CasLoadH(sp0+ASW2( 1, 1));
-   ASW2 sp1=sp0+ASW2(8,0);
-   AH3 a1=CasLoadH(sp1+ASW2(-1,-1));
-   AH3 b1=CasLoadH(sp1+ASW2( 0,-1));
-   AH3 c1=CasLoadH(sp1+ASW2( 1,-1));
-   AH3 d1=CasLoadH(sp1+ASW2(-1, 0));
-   AH3 e1=CasLoadH(sp1);
-   AH3 f1=CasLoadH(sp1+ASW2( 1, 0));
-   AH3 g1=CasLoadH(sp1+ASW2(-1, 1));
-   AH3 h1=CasLoadH(sp1+ASW2( 0, 1));
-   AH3 i1=CasLoadH(sp1+ASW2( 1, 1));
-   // AOS to SOA conversion.
-   AH2 aR=AH2(a0.r,a1.r);
-   AH2 aG=AH2(a0.g,a1.g);
-   AH2 aB=AH2(a0.b,a1.b);
-   AH2 bR=AH2(b0.r,b1.r);
-   AH2 bG=AH2(b0.g,b1.g);
-   AH2 bB=AH2(b0.b,b1.b);
-   AH2 cR=AH2(c0.r,c1.r);
-   AH2 cG=AH2(c0.g,c1.g);
-   AH2 cB=AH2(c0.b,c1.b);
-   AH2 dR=AH2(d0.r,d1.r);
-   AH2 dG=AH2(d0.g,d1.g);
-   AH2 dB=AH2(d0.b,d1.b);
-   AH2 eR=AH2(e0.r,e1.r);
-   AH2 eG=AH2(e0.g,e1.g);
-   AH2 eB=AH2(e0.b,e1.b);
-   AH2 fR=AH2(f0.r,f1.r);
-   AH2 fG=AH2(f0.g,f1.g);
-   AH2 fB=AH2(f0.b,f1.b);
-   AH2 gR=AH2(g0.r,g1.r);
-   AH2 gG=AH2(g0.g,g1.g);
-   AH2 gB=AH2(g0.b,g1.b);
-   AH2 hR=AH2(h0.r,h1.r);
-   AH2 hG=AH2(h0.g,h1.g);
-   AH2 hB=AH2(h0.b,h1.b);
-   AH2 iR=AH2(i0.r,i1.r);
-   AH2 iG=AH2(i0.g,i1.g);
-   AH2 iB=AH2(i0.b,i1.b);
-   // Run optional input transform.
-   CasInputH(aR,aG,aB);
-   CasInputH(bR,bG,bB);
-   CasInputH(cR,cG,cB);
-   CasInputH(dR,dG,dB);
-   CasInputH(eR,eG,eB);
-   CasInputH(fR,fG,fB);
-   CasInputH(gR,gG,gB);
-   CasInputH(hR,hG,hB);
-   CasInputH(iR,iG,iB);
-   // Soft min and max.
-   AH2 mnR=min(min(fR,hR),min(min(bR,dR),eR));
-   AH2 mnG=min(min(fG,hG),min(min(bG,dG),eG));
-   AH2 mnB=min(min(fB,hB),min(min(bB,dB),eB));
-   #ifdef CAS_BETTER_DIAGONALS
-    AH2 mnR2=min(min(gR,iR),min(min(aR,cR),mnR));
-    AH2 mnG2=min(min(gG,iG),min(min(aG,cG),mnG));
-    AH2 mnB2=min(min(gB,iB),min(min(aB,cB),mnB));
-    mnR=mnR+mnR2;
-    mnG=mnG+mnG2;
-    mnB=mnB+mnB2;
-   #endif
-   AH2 mxR=max(max(fR,hR),max(max(bR,dR),eR));
-   AH2 mxG=max(max(fG,hG),max(max(bG,dG),eG));
-   AH2 mxB=max(max(fB,hB),max(max(bB,dB),eB));
-   #ifdef CAS_BETTER_DIAGONALS
-    AH2 mxR2=max(max(gR,iR),max(max(aR,cR),mxR));
-    AH2 mxG2=max(max(gG,iG),max(max(aG,cG),mxG));
-    AH2 mxB2=max(max(gB,iB),max(max(aB,cB),mxB));
-    mxR=mxR+mxR2;
-    mxG=mxG+mxG2;
-    mxB=mxB+mxB2;
-   #endif
-   // Smooth minimum distance to signal limit divided by smooth max.
-   #ifdef CAS_GO_SLOWER
-    AH2 rcpMR=ARcpH2(mxR);
-    AH2 rcpMG=ARcpH2(mxG);
-    AH2 rcpMB=ARcpH2(mxB);
-   #else
-    AH2 rcpMR=APrxLoRcpH2(mxR);
-    AH2 rcpMG=APrxLoRcpH2(mxG);
-    AH2 rcpMB=APrxLoRcpH2(mxB);
-   #endif
-   #ifdef CAS_BETTER_DIAGONALS
-    AH2 ampR=ASatH2(min(mnR,AH2_(2.0)-mxR)*rcpMR);
-    AH2 ampG=ASatH2(min(mnG,AH2_(2.0)-mxG)*rcpMG);
-    AH2 ampB=ASatH2(min(mnB,AH2_(2.0)-mxB)*rcpMB);
-   #else
-    AH2 ampR=ASatH2(min(mnR,AH2_(1.0)-mxR)*rcpMR);
-    AH2 ampG=ASatH2(min(mnG,AH2_(1.0)-mxG)*rcpMG);
-    AH2 ampB=ASatH2(min(mnB,AH2_(1.0)-mxB)*rcpMB);
-   #endif
-   // Shaping amount of sharpening.
-   #ifdef CAS_GO_SLOWER
-    ampR=sqrt(ampR);
-    ampG=sqrt(ampG);
-    ampB=sqrt(ampB);
-   #else
-    ampR=APrxLoSqrtH2(ampR);
-    ampG=APrxLoSqrtH2(ampG);
-    ampB=APrxLoSqrtH2(ampB);
-   #endif
-   // Filter shape.
-   AH1 peak=AH2_AU1(const1.y).x;
-   AH2 wR=ampR*AH2_(peak);
-   AH2 wG=ampG*AH2_(peak);
-   AH2 wB=ampB*AH2_(peak);
-   // Filter.
-   #ifndef CAS_SLOW
-    #ifdef CAS_GO_SLOWER
-     AH2 rcpWeight=ARcpH2(AH2_(1.0)+AH2_(4.0)*wG);
-    #else
-     AH2 rcpWeight=APrxMedRcpH2(AH2_(1.0)+AH2_(4.0)*wG);
-    #endif
-    pixR=ASatH2((bR*wG+dR*wG+fR*wG+hR*wG+eR)*rcpWeight);
-    pixG=ASatH2((bG*wG+dG*wG+fG*wG+hG*wG+eG)*rcpWeight);
-    pixB=ASatH2((bB*wG+dB*wG+fB*wG+hB*wG+eB)*rcpWeight);
-   #else
-    #ifdef CAS_GO_SLOWER
-     AH2 rcpWeightR=ARcpH2(AH2_(1.0)+AH2_(4.0)*wR);
-     AH2 rcpWeightG=ARcpH2(AH2_(1.0)+AH2_(4.0)*wG);
-     AH2 rcpWeightB=ARcpH2(AH2_(1.0)+AH2_(4.0)*wB);
-    #else
-     AH2 rcpWeightR=APrxMedRcpH2(AH2_(1.0)+AH2_(4.0)*wR);
-     AH2 rcpWeightG=APrxMedRcpH2(AH2_(1.0)+AH2_(4.0)*wG);
-     AH2 rcpWeightB=APrxMedRcpH2(AH2_(1.0)+AH2_(4.0)*wB);
-    #endif
-    pixR=ASatH2((bR*wR+dR*wR+fR*wR+hR*wR+eR)*rcpWeightR);
-    pixG=ASatH2((bG*wG+dG*wG+fG*wG+hG*wG+eG)*rcpWeightG);
-    pixB=ASatH2((bB*wB+dB*wB+fB*wB+hB*wB+eB)*rcpWeightB);
-   #endif
-   return;}
-//------------------------------------------------------------------------------------------------------------------------------
-  // Scaling algorithm adaptively interpolates between nearest 4 results of the non-scaling algorithm.
-  AF2 pp=AF2(ip)*AF2_AU2(const0.xy)+AF2_AU2(const0.zw);
-  // Tile 0.
-  // Fractional position is needed in high precision here.
-  AF2 fp0=floor(pp);
-  AH2 ppX;
-  ppX.x=AH1(pp.x-fp0.x);
-  AH1 ppY=AH1(pp.y-fp0.y);
-  ASW2 sp0=ASW2(fp0);
-  AH3 a0=CasLoadH(sp0+ASW2(-1,-1));
-  AH3 b0=CasLoadH(sp0+ASW2( 0,-1));
-  AH3 e0=CasLoadH(sp0+ASW2(-1, 0));
-  AH3 f0=CasLoadH(sp0);
-  AH3 c0=CasLoadH(sp0+ASW2( 1,-1));
-  AH3 d0=CasLoadH(sp0+ASW2( 2,-1));
-  AH3 g0=CasLoadH(sp0+ASW2( 1, 0));
-  AH3 h0=CasLoadH(sp0+ASW2( 2, 0));
-  AH3 i0=CasLoadH(sp0+ASW2(-1, 1));
-  AH3 j0=CasLoadH(sp0+ASW2( 0, 1));
-  AH3 m0=CasLoadH(sp0+ASW2(-1, 2));
-  AH3 n0=CasLoadH(sp0+ASW2( 0, 2));
-  AH3 k0=CasLoadH(sp0+ASW2( 1, 1));
-  AH3 l0=CasLoadH(sp0+ASW2( 2, 1));
-  AH3 o0=CasLoadH(sp0+ASW2( 1, 2));
-  AH3 p0=CasLoadH(sp0+ASW2( 2, 2));
-  // Tile 1 (offset only in x).
-  AF1 pp1=pp.x+AF1_AU1(const1.z);
-  AF1 fp1=floor(pp1);
-  ppX.y=AH1(pp1-fp1);
-  ASW2 sp1=ASW2(fp1,sp0.y);
-  AH3 a1=CasLoadH(sp1+ASW2(-1,-1));
-  AH3 b1=CasLoadH(sp1+ASW2( 0,-1));
-  AH3 e1=CasLoadH(sp1+ASW2(-1, 0));
-  AH3 f1=CasLoadH(sp1);
-  AH3 c1=CasLoadH(sp1+ASW2( 1,-1));
-  AH3 d1=CasLoadH(sp1+ASW2( 2,-1));
-  AH3 g1=CasLoadH(sp1+ASW2( 1, 0));
-  AH3 h1=CasLoadH(sp1+ASW2( 2, 0));
-  AH3 i1=CasLoadH(sp1+ASW2(-1, 1));
-  AH3 j1=CasLoadH(sp1+ASW2( 0, 1));
-  AH3 m1=CasLoadH(sp1+ASW2(-1, 2));
-  AH3 n1=CasLoadH(sp1+ASW2( 0, 2));
-  AH3 k1=CasLoadH(sp1+ASW2( 1, 1));
-  AH3 l1=CasLoadH(sp1+ASW2( 2, 1));
-  AH3 o1=CasLoadH(sp1+ASW2( 1, 2));
-  AH3 p1=CasLoadH(sp1+ASW2( 2, 2));
-  // AOS to SOA conversion.
-  AH2 aR=AH2(a0.r,a1.r);
-  AH2 aG=AH2(a0.g,a1.g);
-  AH2 aB=AH2(a0.b,a1.b);
-  AH2 bR=AH2(b0.r,b1.r);
-  AH2 bG=AH2(b0.g,b1.g);
-  AH2 bB=AH2(b0.b,b1.b);
-  AH2 cR=AH2(c0.r,c1.r);
-  AH2 cG=AH2(c0.g,c1.g);
-  AH2 cB=AH2(c0.b,c1.b);
-  AH2 dR=AH2(d0.r,d1.r);
-  AH2 dG=AH2(d0.g,d1.g);
-  AH2 dB=AH2(d0.b,d1.b);
-  AH2 eR=AH2(e0.r,e1.r);
-  AH2 eG=AH2(e0.g,e1.g);
-  AH2 eB=AH2(e0.b,e1.b);
-  AH2 fR=AH2(f0.r,f1.r);
-  AH2 fG=AH2(f0.g,f1.g);
-  AH2 fB=AH2(f0.b,f1.b);
-  AH2 gR=AH2(g0.r,g1.r);
-  AH2 gG=AH2(g0.g,g1.g);
-  AH2 gB=AH2(g0.b,g1.b);
-  AH2 hR=AH2(h0.r,h1.r);
-  AH2 hG=AH2(h0.g,h1.g);
-  AH2 hB=AH2(h0.b,h1.b);
-  AH2 iR=AH2(i0.r,i1.r);
-  AH2 iG=AH2(i0.g,i1.g);
-  AH2 iB=AH2(i0.b,i1.b);
-  AH2 jR=AH2(j0.r,j1.r);
-  AH2 jG=AH2(j0.g,j1.g);
-  AH2 jB=AH2(j0.b,j1.b);
-  AH2 kR=AH2(k0.r,k1.r);
-  AH2 kG=AH2(k0.g,k1.g);
-  AH2 kB=AH2(k0.b,k1.b);
-  AH2 lR=AH2(l0.r,l1.r);
-  AH2 lG=AH2(l0.g,l1.g);
-  AH2 lB=AH2(l0.b,l1.b);
-  AH2 mR=AH2(m0.r,m1.r);
-  AH2 mG=AH2(m0.g,m1.g);
-  AH2 mB=AH2(m0.b,m1.b);
-  AH2 nR=AH2(n0.r,n1.r);
-  AH2 nG=AH2(n0.g,n1.g);
-  AH2 nB=AH2(n0.b,n1.b);
-  AH2 oR=AH2(o0.r,o1.r);
-  AH2 oG=AH2(o0.g,o1.g);
-  AH2 oB=AH2(o0.b,o1.b);
-  AH2 pR=AH2(p0.r,p1.r);
-  AH2 pG=AH2(p0.g,p1.g);
-  AH2 pB=AH2(p0.b,p1.b);
-  // Run optional input transform.
-  CasInputH(aR,aG,aB);
-  CasInputH(bR,bG,bB);
-  CasInputH(cR,cG,cB);
-  CasInputH(dR,dG,dB);
-  CasInputH(eR,eG,eB);
-  CasInputH(fR,fG,fB);
-  CasInputH(gR,gG,gB);
-  CasInputH(hR,hG,hB);
-  CasInputH(iR,iG,iB);
-  CasInputH(jR,jG,jB);
-  CasInputH(kR,kG,kB);
-  CasInputH(lR,lG,lB);
-  CasInputH(mR,mG,mB);
-  CasInputH(nR,nG,nB);
-  CasInputH(oR,oG,oB);
-  CasInputH(pR,pG,pB);
-  // Soft min and max.
-  // These are 2.0x bigger (factored out the extra multiply).
-  //  a b c             b
-  //  e f g * 0.5  +  e f g * 0.5  [F]
-  //  i j k             j
-  AH2 mnfR=AMin3H2(AMin3H2(bR,eR,fR),gR,jR);
-  AH2 mnfG=AMin3H2(AMin3H2(bG,eG,fG),gG,jG);
-  AH2 mnfB=AMin3H2(AMin3H2(bB,eB,fB),gB,jB);
-  #ifdef CAS_BETTER_DIAGONALS
-   AH2 mnfR2=AMin3H2(AMin3H2(mnfR,aR,cR),iR,kR);
-   AH2 mnfG2=AMin3H2(AMin3H2(mnfG,aG,cG),iG,kG);
-   AH2 mnfB2=AMin3H2(AMin3H2(mnfB,aB,cB),iB,kB);
-   mnfR=mnfR+mnfR2;
-   mnfG=mnfG+mnfG2;
-   mnfB=mnfB+mnfB2;
-  #endif
-  AH2 mxfR=AMax3H2(AMax3H2(bR,eR,fR),gR,jR);
-  AH2 mxfG=AMax3H2(AMax3H2(bG,eG,fG),gG,jG);
-  AH2 mxfB=AMax3H2(AMax3H2(bB,eB,fB),gB,jB);
-  #ifdef CAS_BETTER_DIAGONALS
-   AH2 mxfR2=AMax3H2(AMax3H2(mxfR,aR,cR),iR,kR);
-   AH2 mxfG2=AMax3H2(AMax3H2(mxfG,aG,cG),iG,kG);
-   AH2 mxfB2=AMax3H2(AMax3H2(mxfB,aB,cB),iB,kB);
-   mxfR=mxfR+mxfR2;
-   mxfG=mxfG+mxfG2;
-   mxfB=mxfB+mxfB2;
-  #endif
-  //  b c d             c
-  //  f g h * 0.5  +  f g h * 0.5  [G]
-  //  j k l             k
-  AH2 mngR=AMin3H2(AMin3H2(cR,fR,gR),hR,kR);
-  AH2 mngG=AMin3H2(AMin3H2(cG,fG,gG),hG,kG);
-  AH2 mngB=AMin3H2(AMin3H2(cB,fB,gB),hB,kB);
-  #ifdef CAS_BETTER_DIAGONALS
-   AH2 mngR2=AMin3H2(AMin3H2(mngR,bR,dR),jR,lR);
-   AH2 mngG2=AMin3H2(AMin3H2(mngG,bG,dG),jG,lG);
-   AH2 mngB2=AMin3H2(AMin3H2(mngB,bB,dB),jB,lB);
-   mngR=mngR+mngR2;
-   mngG=mngG+mngG2;
-   mngB=mngB+mngB2;
-  #endif
-  AH2 mxgR=AMax3H2(AMax3H2(cR,fR,gR),hR,kR);
-  AH2 mxgG=AMax3H2(AMax3H2(cG,fG,gG),hG,kG);
-  AH2 mxgB=AMax3H2(AMax3H2(cB,fB,gB),hB,kB);
-  #ifdef CAS_BETTER_DIAGONALS
-   AH2 mxgR2=AMax3H2(AMax3H2(mxgR,bR,dR),jR,lR);
-   AH2 mxgG2=AMax3H2(AMax3H2(mxgG,bG,dG),jG,lG);
-   AH2 mxgB2=AMax3H2(AMax3H2(mxgB,bB,dB),jB,lB);
-   mxgR=mxgR+mxgR2;
-   mxgG=mxgG+mxgG2;
-   mxgB=mxgB+mxgB2;
-  #endif
-  //  e f g             f
-  //  i j k * 0.5  +  i j k * 0.5  [J]
-  //  m n o             n
-  AH2 mnjR=AMin3H2(AMin3H2(fR,iR,jR),kR,nR);
-  AH2 mnjG=AMin3H2(AMin3H2(fG,iG,jG),kG,nG);
-  AH2 mnjB=AMin3H2(AMin3H2(fB,iB,jB),kB,nB);
-  #ifdef CAS_BETTER_DIAGONALS
-   AH2 mnjR2=AMin3H2(AMin3H2(mnjR,eR,gR),mR,oR);
-   AH2 mnjG2=AMin3H2(AMin3H2(mnjG,eG,gG),mG,oG);
-   AH2 mnjB2=AMin3H2(AMin3H2(mnjB,eB,gB),mB,oB);
-   mnjR=mnjR+mnjR2;
-   mnjG=mnjG+mnjG2;
-   mnjB=mnjB+mnjB2;
-  #endif
-  AH2 mxjR=AMax3H2(AMax3H2(fR,iR,jR),kR,nR);
-  AH2 mxjG=AMax3H2(AMax3H2(fG,iG,jG),kG,nG);
-  AH2 mxjB=AMax3H2(AMax3H2(fB,iB,jB),kB,nB);
-  #ifdef CAS_BETTER_DIAGONALS
-   AH2 mxjR2=AMax3H2(AMax3H2(mxjR,eR,gR),mR,oR);
-   AH2 mxjG2=AMax3H2(AMax3H2(mxjG,eG,gG),mG,oG);
-   AH2 mxjB2=AMax3H2(AMax3H2(mxjB,eB,gB),mB,oB);
-   mxjR=mxjR+mxjR2;
-   mxjG=mxjG+mxjG2;
-   mxjB=mxjB+mxjB2;
-  #endif
-  //  f g h             g
-  //  j k l * 0.5  +  j k l * 0.5  [K]
-  //  n o p             o
-  AH2 mnkR=AMin3H2(AMin3H2(gR,jR,kR),lR,oR);
-  AH2 mnkG=AMin3H2(AMin3H2(gG,jG,kG),lG,oG);
-  AH2 mnkB=AMin3H2(AMin3H2(gB,jB,kB),lB,oB);
-  #ifdef CAS_BETTER_DIAGONALS
-   AH2 mnkR2=AMin3H2(AMin3H2(mnkR,fR,hR),nR,pR);
-   AH2 mnkG2=AMin3H2(AMin3H2(mnkG,fG,hG),nG,pG);
-   AH2 mnkB2=AMin3H2(AMin3H2(mnkB,fB,hB),nB,pB);
-   mnkR=mnkR+mnkR2;
-   mnkG=mnkG+mnkG2;
-   mnkB=mnkB+mnkB2;
-  #endif
-  AH2 mxkR=AMax3H2(AMax3H2(gR,jR,kR),lR,oR);
-  AH2 mxkG=AMax3H2(AMax3H2(gG,jG,kG),lG,oG);
-  AH2 mxkB=AMax3H2(AMax3H2(gB,jB,kB),lB,oB);
-  #ifdef CAS_BETTER_DIAGONALS
-   AH2 mxkR2=AMax3H2(AMax3H2(mxkR,fR,hR),nR,pR);
-   AH2 mxkG2=AMax3H2(AMax3H2(mxkG,fG,hG),nG,pG);
-   AH2 mxkB2=AMax3H2(AMax3H2(mxkB,fB,hB),nB,pB);
-   mxkR=mxkR+mxkR2;
-   mxkG=mxkG+mxkG2;
-   mxkB=mxkB+mxkB2;
-  #endif
-  // Smooth minimum distance to signal limit divided by smooth max.
-  #ifdef CAS_GO_SLOWER
-   AH2 rcpMfR=ARcpH2(mxfR);
-   AH2 rcpMfG=ARcpH2(mxfG);
-   AH2 rcpMfB=ARcpH2(mxfB);
-   AH2 rcpMgR=ARcpH2(mxgR);
-   AH2 rcpMgG=ARcpH2(mxgG);
-   AH2 rcpMgB=ARcpH2(mxgB);
-   AH2 rcpMjR=ARcpH2(mxjR);
-   AH2 rcpMjG=ARcpH2(mxjG);
-   AH2 rcpMjB=ARcpH2(mxjB);
-   AH2 rcpMkR=ARcpH2(mxkR);
-   AH2 rcpMkG=ARcpH2(mxkG);
-   AH2 rcpMkB=ARcpH2(mxkB);
-  #else
-   AH2 rcpMfR=APrxLoRcpH2(mxfR);
-   AH2 rcpMfG=APrxLoRcpH2(mxfG);
-   AH2 rcpMfB=APrxLoRcpH2(mxfB);
-   AH2 rcpMgR=APrxLoRcpH2(mxgR);
-   AH2 rcpMgG=APrxLoRcpH2(mxgG);
-   AH2 rcpMgB=APrxLoRcpH2(mxgB);
-   AH2 rcpMjR=APrxLoRcpH2(mxjR);
-   AH2 rcpMjG=APrxLoRcpH2(mxjG);
-   AH2 rcpMjB=APrxLoRcpH2(mxjB);
-   AH2 rcpMkR=APrxLoRcpH2(mxkR);
-   AH2 rcpMkG=APrxLoRcpH2(mxkG);
-   AH2 rcpMkB=APrxLoRcpH2(mxkB);
-  #endif
-  #ifdef CAS_BETTER_DIAGONALS
-   AH2 ampfR=ASatH2(min(mnfR,AH2_(2.0)-mxfR)*rcpMfR);
-   AH2 ampfG=ASatH2(min(mnfG,AH2_(2.0)-mxfG)*rcpMfG);
-   AH2 ampfB=ASatH2(min(mnfB,AH2_(2.0)-mxfB)*rcpMfB);
-   AH2 ampgR=ASatH2(min(mngR,AH2_(2.0)-mxgR)*rcpMgR);
-   AH2 ampgG=ASatH2(min(mngG,AH2_(2.0)-mxgG)*rcpMgG);
-   AH2 ampgB=ASatH2(min(mngB,AH2_(2.0)-mxgB)*rcpMgB);
-   AH2 ampjR=ASatH2(min(mnjR,AH2_(2.0)-mxjR)*rcpMjR);
-   AH2 ampjG=ASatH2(min(mnjG,AH2_(2.0)-mxjG)*rcpMjG);
-   AH2 ampjB=ASatH2(min(mnjB,AH2_(2.0)-mxjB)*rcpMjB);
-   AH2 ampkR=ASatH2(min(mnkR,AH2_(2.0)-mxkR)*rcpMkR);
-   AH2 ampkG=ASatH2(min(mnkG,AH2_(2.0)-mxkG)*rcpMkG);
-   AH2 ampkB=ASatH2(min(mnkB,AH2_(2.0)-mxkB)*rcpMkB);
-  #else
-   AH2 ampfR=ASatH2(min(mnfR,AH2_(1.0)-mxfR)*rcpMfR);
-   AH2 ampfG=ASatH2(min(mnfG,AH2_(1.0)-mxfG)*rcpMfG);
-   AH2 ampfB=ASatH2(min(mnfB,AH2_(1.0)-mxfB)*rcpMfB);
-   AH2 ampgR=ASatH2(min(mngR,AH2_(1.0)-mxgR)*rcpMgR);
-   AH2 ampgG=ASatH2(min(mngG,AH2_(1.0)-mxgG)*rcpMgG);
-   AH2 ampgB=ASatH2(min(mngB,AH2_(1.0)-mxgB)*rcpMgB);
-   AH2 ampjR=ASatH2(min(mnjR,AH2_(1.0)-mxjR)*rcpMjR);
-   AH2 ampjG=ASatH2(min(mnjG,AH2_(1.0)-mxjG)*rcpMjG);
-   AH2 ampjB=ASatH2(min(mnjB,AH2_(1.0)-mxjB)*rcpMjB);
-   AH2 ampkR=ASatH2(min(mnkR,AH2_(1.0)-mxkR)*rcpMkR);
-   AH2 ampkG=ASatH2(min(mnkG,AH2_(1.0)-mxkG)*rcpMkG);
-   AH2 ampkB=ASatH2(min(mnkB,AH2_(1.0)-mxkB)*rcpMkB);
-  #endif
-  // Shaping amount of sharpening.
-  #ifdef CAS_GO_SLOWER
-   ampfR=sqrt(ampfR);
-   ampfG=sqrt(ampfG);
-   ampfB=sqrt(ampfB);
-   ampgR=sqrt(ampgR);
-   ampgG=sqrt(ampgG);
-   ampgB=sqrt(ampgB);
-   ampjR=sqrt(ampjR);
-   ampjG=sqrt(ampjG);
-   ampjB=sqrt(ampjB);
-   ampkR=sqrt(ampkR);
-   ampkG=sqrt(ampkG);
-   ampkB=sqrt(ampkB);
-  #else
-   ampfR=APrxLoSqrtH2(ampfR);
-   ampfG=APrxLoSqrtH2(ampfG);
-   ampfB=APrxLoSqrtH2(ampfB);
-   ampgR=APrxLoSqrtH2(ampgR);
-   ampgG=APrxLoSqrtH2(ampgG);
-   ampgB=APrxLoSqrtH2(ampgB);
-   ampjR=APrxLoSqrtH2(ampjR);
-   ampjG=APrxLoSqrtH2(ampjG);
-   ampjB=APrxLoSqrtH2(ampjB);
-   ampkR=APrxLoSqrtH2(ampkR);
-   ampkG=APrxLoSqrtH2(ampkG);
-   ampkB=APrxLoSqrtH2(ampkB);
-  #endif
-  // Filter shape.
-  AH1 peak=AH2_AU1(const1.y).x;
-  AH2 wfR=ampfR*AH2_(peak);
-  AH2 wfG=ampfG*AH2_(peak);
-  AH2 wfB=ampfB*AH2_(peak);
-  AH2 wgR=ampgR*AH2_(peak);
-  AH2 wgG=ampgG*AH2_(peak);
-  AH2 wgB=ampgB*AH2_(peak);
-  AH2 wjR=ampjR*AH2_(peak);
-  AH2 wjG=ampjG*AH2_(peak);
-  AH2 wjB=ampjB*AH2_(peak);
-  AH2 wkR=ampkR*AH2_(peak);
-  AH2 wkG=ampkG*AH2_(peak);
-  AH2 wkB=ampkB*AH2_(peak);
-  // Blend between 4 results.
-  AH2 s=(AH2_(1.0)-ppX)*(AH2_(1.0)-AH2_(ppY));
-  AH2 t=           ppX *(AH2_(1.0)-AH2_(ppY));
-  AH2 u=(AH2_(1.0)-ppX)*           AH2_(ppY) ;
-  AH2 v=           ppX *           AH2_(ppY) ;
-  // Thin edges to hide bilinear interpolation (helps diagonals).
-  AH2 thinB=AH2_(1.0/32.0);
-  #ifdef CAS_GO_SLOWER
-   s*=ARcpH2(thinB+(mxfG-mnfG));
-   t*=ARcpH2(thinB+(mxgG-mngG));
-   u*=ARcpH2(thinB+(mxjG-mnjG));
-   v*=ARcpH2(thinB+(mxkG-mnkG));
-  #else
-   s*=APrxLoRcpH2(thinB+(mxfG-mnfG));
-   t*=APrxLoRcpH2(thinB+(mxgG-mngG));
-   u*=APrxLoRcpH2(thinB+(mxjG-mnjG));
-   v*=APrxLoRcpH2(thinB+(mxkG-mnkG));
-  #endif
-  // Final weighting.
-  AH2 qbeR=wfR*s;
-  AH2 qbeG=wfG*s;
-  AH2 qbeB=wfB*s;
-  AH2 qchR=wgR*t;
-  AH2 qchG=wgG*t;
-  AH2 qchB=wgB*t;
-  AH2 qfR=wgR*t+wjR*u+s;
-  AH2 qfG=wgG*t+wjG*u+s;
-  AH2 qfB=wgB*t+wjB*u+s;
-  AH2 qgR=wfR*s+wkR*v+t;
-  AH2 qgG=wfG*s+wkG*v+t;
-  AH2 qgB=wfB*s+wkB*v+t;
-  AH2 qjR=wfR*s+wkR*v+u;
-  AH2 qjG=wfG*s+wkG*v+u;
-  AH2 qjB=wfB*s+wkB*v+u;
-  AH2 qkR=wgR*t+wjR*u+v;
-  AH2 qkG=wgG*t+wjG*u+v;
-  AH2 qkB=wgB*t+wjB*u+v;
-  AH2 qinR=wjR*u;
-  AH2 qinG=wjG*u;
-  AH2 qinB=wjB*u;
-  AH2 qloR=wkR*v;
-  AH2 qloG=wkG*v;
-  AH2 qloB=wkB*v;
-  // Filter.
-  #ifndef CAS_SLOW
-   #ifdef CAS_GO_SLOWER
-    AH2 rcpWG=ARcpH2(AH2_(2.0)*qbeG+AH2_(2.0)*qchG+AH2_(2.0)*qinG+AH2_(2.0)*qloG+qfG+qgG+qjG+qkG);
-   #else
-    AH2 rcpWG=APrxMedRcpH2(AH2_(2.0)*qbeG+AH2_(2.0)*qchG+AH2_(2.0)*qinG+AH2_(2.0)*qloG+qfG+qgG+qjG+qkG);
-   #endif
-   pixR=ASatH2((bR*qbeG+eR*qbeG+cR*qchG+hR*qchG+iR*qinG+nR*qinG+lR*qloG+oR*qloG+fR*qfG+gR*qgG+jR*qjG+kR*qkG)*rcpWG);
-   pixG=ASatH2((bG*qbeG+eG*qbeG+cG*qchG+hG*qchG+iG*qinG+nG*qinG+lG*qloG+oG*qloG+fG*qfG+gG*qgG+jG*qjG+kG*qkG)*rcpWG);
-   pixB=ASatH2((bB*qbeG+eB*qbeG+cB*qchG+hB*qchG+iB*qinG+nB*qinG+lB*qloG+oB*qloG+fB*qfG+gB*qgG+jB*qjG+kB*qkG)*rcpWG);
-  #else
-   #ifdef CAS_GO_SLOWER
-    AH2 rcpWR=ARcpH2(AH2_(2.0)*qbeR+AH2_(2.0)*qchR+AH2_(2.0)*qinR+AH2_(2.0)*qloR+qfR+qgR+qjR+qkR);
-    AH2 rcpWG=ARcpH2(AH2_(2.0)*qbeG+AH2_(2.0)*qchG+AH2_(2.0)*qinG+AH2_(2.0)*qloG+qfG+qgG+qjG+qkG);
-    AH2 rcpWB=ARcpH2(AH2_(2.0)*qbeB+AH2_(2.0)*qchB+AH2_(2.0)*qinB+AH2_(2.0)*qloB+qfB+qgB+qjB+qkB);
-   #else
-    AH2 rcpWR=APrxMedRcpH2(AH2_(2.0)*qbeR+AH2_(2.0)*qchR+AH2_(2.0)*qinR+AH2_(2.0)*qloR+qfR+qgR+qjR+qkR);
-    AH2 rcpWG=APrxMedRcpH2(AH2_(2.0)*qbeG+AH2_(2.0)*qchG+AH2_(2.0)*qinG+AH2_(2.0)*qloG+qfG+qgG+qjG+qkG);
-    AH2 rcpWB=APrxMedRcpH2(AH2_(2.0)*qbeB+AH2_(2.0)*qchB+AH2_(2.0)*qinB+AH2_(2.0)*qloB+qfB+qgB+qjB+qkB);
-   #endif
-   pixR=ASatH2((bR*qbeR+eR*qbeR+cR*qchR+hR*qchR+iR*qinR+nR*qinR+lR*qloR+oR*qloR+fR*qfR+gR*qgR+jR*qjR+kR*qkR)*rcpWR);
-   pixG=ASatH2((bG*qbeG+eG*qbeG+cG*qchG+hG*qchG+iG*qinG+nG*qinG+lG*qloG+oG*qloG+fG*qfG+gG*qgG+jG*qjG+kG*qkG)*rcpWG);
-   pixB=ASatH2((bB*qbeB+eB*qbeB+cB*qchB+hB*qchB+iB*qinB+nB*qinB+lB*qloB+oB*qloB+fB*qfB+gB*qgB+jB*qjB+kB*qkB)*rcpWB);
-  #endif
- }
-#endif
 
 #ifdef A_GPU
 void main()
@@ -3735,7 +2546,7 @@ void main()
     vec4 diff = vec4(0.f);
     uvec2 point = uvec2(vary_fragcoord * out_screen_res.xy);
     CasFilter(diff.r, diff.g, diff.b, point, cas_param_0, cas_param_1, true);
-    diff.a = textureLod(diffuseRect, vary_fragcoord, 0.0f).a;
+    diff.a = texture(diffuseRect, vary_fragcoord).a;
     frag_color = diff;
 }
 #endif
-- 
GitLab