removed some implementations
@@ -57,8 +57,8 @@ | ||
57 | 57 | // const size_t nThreads = 2; |
58 | 58 | // const size_t nThreads = 4; |
59 | 59 | #else |
60 | -// const size_t nThreads = si.dwNumberOfProcessors; | |
61 | - const size_t nThreads = 1; | |
60 | + const size_t nThreads = si.dwNumberOfProcessors; | |
61 | +// const size_t nThreads = 1; | |
62 | 62 | #endif |
63 | 63 | Threads<blur_1b::Parameter> threads; |
64 | 64 | threads.SetUp(nThreads); |
@@ -106,10 +106,6 @@ | ||
106 | 106 | blur_1b::test_10, |
107 | 107 | blur_1b::test_11, |
108 | 108 | blur_1b::test_12, |
109 | - blur_1b::test_13, | |
110 | - blur_1b::test_14, | |
111 | - blur_1b::test_15, | |
112 | - blur_1b::test_16, | |
113 | 109 | |
114 | 110 | //blur_1b::test_20, |
115 | 111 | //blur_1b::test_21, |
@@ -1940,497 +1940,6 @@ | ||
1940 | 1940 | |
1941 | 1941 | } |
1942 | 1942 | |
1943 | -static __forceinline | |
1944 | -void | |
1945 | -repeatShiftSum3(__m128i m01, __m128i& m0, __m128i& m1, __m128i& remain0) | |
1946 | -{ | |
1947 | - m0 = _mm_unpacklo_epi8(m01, _mm_setzero_si128()); | |
1948 | - m1 = _mm_unpackhi_epi8(m01, _mm_setzero_si128()); | |
1949 | -#if 1 | |
1950 | - remain0 = _mm_srli_si128(_mm_add_epi16(m1, _mm_srli_si128(m1, 2)), 12); | |
1951 | - | |
1952 | - __m128i m01_1 = _mm_slli_si128(m01, 1); | |
1953 | - __m128i m01_2 = _mm_slli_si128(m01, 2); | |
1954 | - m0 = | |
1955 | - _mm_add_epi16( | |
1956 | - _mm_add_epi16( | |
1957 | - m0, | |
1958 | - _mm_unpacklo_epi8(m01_1, _mm_setzero_si128()) | |
1959 | - ), | |
1960 | - _mm_unpacklo_epi8(m01_2, _mm_setzero_si128()) | |
1961 | - ); | |
1962 | - m1 = | |
1963 | - _mm_add_epi16( | |
1964 | - _mm_add_epi16( | |
1965 | - m1, | |
1966 | - _mm_unpackhi_epi8(m01_1, _mm_setzero_si128()) | |
1967 | - ), | |
1968 | - _mm_unpackhi_epi8(m01_2, _mm_setzero_si128()) | |
1969 | - ); | |
1970 | -#else | |
1971 | - __m128i s0L = _mm_add_epi16(_mm_add_epi16(m0, _mm_slli_si128(m0, 2)), _mm_slli_si128(m0, 4)); | |
1972 | - __m128i s1L = _mm_add_epi16(_mm_add_epi16(m1, _mm_slli_si128(m1, 2)), _mm_slli_si128(m1, 4)); | |
1973 | - __m128i s0R = _mm_srli_si128(_mm_add_epi16(m0, _mm_srli_si128(m0, 2)), 12); | |
1974 | - __m128i s1R = _mm_srli_si128(_mm_add_epi16(m1, _mm_srli_si128(m1, 2)), 12); | |
1975 | - | |
1976 | - m0 = s0L; | |
1977 | - m1 = _mm_add_epi16(s1L, s0R); | |
1978 | - remain0 = s1R; | |
1979 | -#endif | |
1980 | -} | |
1981 | - | |
1982 | -#if _MSC_VER >= 1700 | |
1983 | - | |
1984 | -static __forceinline | |
1985 | -void repeatShiftSum3(__m256i m01, __m256i& m0, __m256i& m1, __m256i& remain0) | |
1986 | -{ | |
1987 | - m0 = _mm256_unpacklo_epi8(m01, _mm256_setzero_si256()); | |
1988 | - m1 = _mm256_unpackhi_epi8(m01, _mm256_setzero_si256()); | |
1989 | - remain0 = _mm256_srli_si256(_mm256_add_epi16(m1, _mm256_srli_si256(m1, 2)), 28); | |
1990 | - | |
1991 | - __m256i m01_1 = _mm256_slli_si256(m01, 1); | |
1992 | - __m256i m01_2 = _mm256_slli_si256(m01, 2); | |
1993 | - m0 = | |
1994 | - _mm256_add_epi16( | |
1995 | - _mm256_add_epi16( | |
1996 | - m0, | |
1997 | - _mm256_unpacklo_epi8(m01_1, _mm256_setzero_si256()) | |
1998 | - ), | |
1999 | - _mm256_unpacklo_epi8(m01_2, _mm256_setzero_si256()) | |
2000 | - ); | |
2001 | - m1 = | |
2002 | - _mm256_add_epi16( | |
2003 | - _mm256_add_epi16( | |
2004 | - m1, | |
2005 | - _mm256_unpackhi_epi8(m01_1, _mm256_setzero_si256()) | |
2006 | - ), | |
2007 | - _mm256_unpackhi_epi8(m01_2, _mm256_setzero_si256()) | |
2008 | - ); | |
2009 | - | |
2010 | -} | |
2011 | - | |
2012 | -#endif | |
2013 | - | |
2014 | -template <size_t SHIFTS> | |
2015 | -static __forceinline | |
2016 | -void repeatShiftSum(__m128i main01, __m128i& main0, __m128i& main1, __m128i& remain0) | |
2017 | -{ | |
2018 | - | |
2019 | - // TODO: efficiently construct slided sum | |
2020 | - // 2 ^ 2 ^ 2 = 8 | |
2021 | - main0 = _mm_unpacklo_epi8(main01, _mm_setzero_si128()); | |
2022 | - main1 = _mm_unpackhi_epi8(main01, _mm_setzero_si128()); | |
2023 | - | |
2024 | - __m128i m0L = main0; | |
2025 | - __m128i m0R = main0; | |
2026 | - __m128i m1L = main1; | |
2027 | - __m128i m1R = main1; | |
2028 | - | |
2029 | - __m128i s0L = main0; | |
2030 | - __m128i s0R = main0; | |
2031 | - __m128i s1L = main1; | |
2032 | - __m128i s1R = main1; | |
2033 | - | |
2034 | - for (size_t i=0; i<SHIFTS-1; ++i) { | |
2035 | - m0L = _mm_slli_si128(m0L, 2); | |
2036 | - m0R = _mm_srli_si128(m0R, 2); | |
2037 | - m1L = _mm_slli_si128(m1L, 2); | |
2038 | - m1R = _mm_srli_si128(m1R, 2); | |
2039 | - s0L = _mm_add_epi16(s0L, m0L); | |
2040 | - s0R = _mm_add_epi16(s0R, m0R); | |
2041 | - s1L = _mm_add_epi16(s1L, m1L); | |
2042 | - s1R = _mm_add_epi16(s1R, m1R); | |
2043 | - } | |
2044 | - | |
2045 | - main0 = s0L; | |
2046 | - s0R = _mm_srli_si128(s0R, (8-(SHIFTS-1))*2); | |
2047 | - main1 = _mm_add_epi16(s1L, s0R); | |
2048 | - remain0 = _mm_srli_si128(s1R, (8-(SHIFTS-1))*2);; | |
2049 | -} | |
2050 | - | |
2051 | -static __forceinline | |
2052 | -void repeatShiftNum(__m128i main01, __m128i& main0, __m128i& main1, __m128i& remain0, size_t count) | |
2053 | -{ | |
2054 | - switch (count) { | |
2055 | - case 3: | |
2056 | - repeatShiftSum3(main01, main0, main1, remain0); | |
2057 | - break; | |
2058 | - case 5: | |
2059 | - repeatShiftSum<5>(main01, main0, main1, remain0); | |
2060 | - break; | |
2061 | - case 7: | |
2062 | - repeatShiftSum<7>(main01, main0, main1, remain0); | |
2063 | - break; | |
2064 | - case 9: | |
2065 | - repeatShiftSum<9>(main01, main0, main1, remain0); | |
2066 | - break; | |
2067 | - default: | |
2068 | - break; | |
2069 | - } | |
2070 | -} | |
2071 | - | |
2072 | -void test_12(const Parameter& p) { | |
2073 | - | |
2074 | - BLUR_EXTRACT_PARAMS; | |
2075 | - | |
2076 | - uint32_t hRad = p.radius; | |
2077 | - uint32_t vRad = p.radius; | |
2078 | - uint32_t hLen = 1 + hRad*2; | |
2079 | - uint32_t vLen = 1 + vRad*2; | |
2080 | - uint32_t invLen = 0xFFFFFF / (hLen*vLen); | |
2081 | - uint32_t hCount = p.width; | |
2082 | - uint32_t vCount = p.height; | |
2083 | - | |
2084 | - const __m128i mInvRatio = _mm_set1_epi16(0xFFFF / (hLen*vLen)); | |
2085 | - | |
2086 | - if (hLen > 9) { | |
2087 | - return; | |
2088 | - } | |
2089 | - | |
2090 | - const uint8_t* hLine = p.pSrc; | |
2091 | - uint8_t* vLine = p.pDest; | |
2092 | - OffsetPtr(vLine, destLineOffsetBytes * vRad); | |
2093 | - | |
2094 | - uint16_t* vSumLine = (uint16_t*)pWork2; | |
2095 | - assert((ptrdiff_t)vSumLine % 16 == 0); | |
2096 | - assert((width * 2) % 16 == 0); | |
2097 | - | |
2098 | - RingLinePtr<uint16_t*> vMinusLine(vLen, 0, (uint16_t*)pWork, width*2); | |
2099 | - RingLinePtr<uint16_t*> vPlusLine(vLen, 0, (uint16_t*)pWork, width*2); | |
2100 | - | |
2101 | - // vTop collect | |
2102 | - { | |
2103 | - const uint8_t* hMinus = hLine; | |
2104 | - const uint8_t* hPlus = hLine+hLen; | |
2105 | - size_t hSum = 0; | |
2106 | - // hLeft collect | |
2107 | - for (size_t x=0; x<hLen; ++x) { | |
2108 | - hSum += hLine[x]; | |
2109 | - } | |
2110 | - // hCenter | |
2111 | - for (size_t x=hRad; x<hCount-hRad; ++x) { | |
2112 | - hSum -= *hMinus++; | |
2113 | - hSum += *hPlus++; | |
2114 | - vPlusLine[x] = hSum; | |
2115 | - vSumLine[x] = hSum; | |
2116 | - } | |
2117 | - // hRight | |
2118 | - ; | |
2119 | - OffsetPtr(hLine, srcLineOffsetBytes); | |
2120 | - vPlusLine.moveNext(); | |
2121 | - } | |
2122 | - for (size_t y=1; y<vLen; ++y) { | |
2123 | - const uint8_t* hMinus = hLine; | |
2124 | - const uint8_t* hPlus = hLine+hLen; | |
2125 | - size_t hSum = 0; | |
2126 | - // hLeft collect | |
2127 | - for (size_t x=0; x<hLen; ++x) { | |
2128 | - hSum += hLine[x]; | |
2129 | - } | |
2130 | - // hCenter | |
2131 | - for (size_t x=hRad; x<hCount-hRad; ++x) { | |
2132 | - hSum -= *hMinus++; | |
2133 | - hSum += *hPlus++; | |
2134 | - vPlusLine[x] = hSum; | |
2135 | - vSumLine[x] += hSum; | |
2136 | - } | |
2137 | - // hRight | |
2138 | - ; | |
2139 | - OffsetPtr(hLine, srcLineOffsetBytes); | |
2140 | - vPlusLine.moveNext(); | |
2141 | - } | |
2142 | - | |
2143 | - __m128i* mvSumLine = (__m128i*)vSumLine; | |
2144 | - | |
2145 | - // vMiddle | |
2146 | - for (size_t y=vRad; y<vCount-vLen; ++y) { | |
2147 | - | |
2148 | - assert((ptrdiff_t)hLine % 16 == 0); | |
2149 | - | |
2150 | - const __m128i* mhLine = (const __m128i*)hLine; | |
2151 | - __m128i m01 = mhLine[0]; | |
2152 | - __m128i m0 = _mm_unpacklo_epi8(m01, _mm_setzero_si128()); | |
2153 | - __m128i m1 = _mm_unpackhi_epi8(m01, _mm_setzero_si128()); | |
2154 | - __m128i m23 = mhLine[1]; | |
2155 | - | |
2156 | - // hLeft collect | |
2157 | - __m128i m1r; | |
2158 | - | |
2159 | - __m128i* mvLine = (__m128i*) vLine; | |
2160 | - __m128i* mvMinusLine = vMinusLine; | |
2161 | - __m128i* mvPlusLine = vMinusLine; | |
2162 | - __m128i m0l = _mm_setzero_si128(); | |
2163 | - | |
2164 | - // hCenter | |
2165 | - const size_t loopCount = hCount / 16; | |
2166 | - for (size_t i=0; i<loopCount; ++i) { | |
2167 | - repeatShiftNum(mhLine[i], m0, m1, m1r, hLen); | |
2168 | - m0 = _mm_add_epi16(m0l, m0); | |
2169 | - __m128i sum0 = mvSumLine[i*2+0]; | |
2170 | - __m128i sum1 = mvSumLine[i*2+1]; | |
2171 | - __m128i minus0 = mvMinusLine[i*2+0]; | |
2172 | - __m128i minus1 = mvMinusLine[i*2+1]; | |
2173 | - sum0 = _mm_sub_epi16(sum0, minus0); | |
2174 | - sum1 = _mm_sub_epi16(sum1, minus1); | |
2175 | - mvPlusLine[i*2+0] = m0; | |
2176 | - mvPlusLine[i*2+1] = m1; | |
2177 | - sum0 = _mm_add_epi16(sum0, m0); | |
2178 | - sum1 = _mm_add_epi16(sum1, m1); | |
2179 | - mvSumLine[i*2+0] = sum0; | |
2180 | - mvSumLine[i*2+1] = sum1; | |
2181 | - | |
2182 | - _mm_stream_si128(mvLine+i, _mm_packus_epi16( | |
2183 | - _mm_mulhi_epu16(sum0, mInvRatio), | |
2184 | - _mm_mulhi_epu16(sum1, mInvRatio) | |
2185 | - )); | |
2186 | - m0l = m1r; | |
2187 | - } | |
2188 | - // hRight | |
2189 | - ; | |
2190 | - OffsetPtr(hLine, srcLineOffsetBytes); | |
2191 | - OffsetPtr(vLine, destLineOffsetBytes); | |
2192 | - vMinusLine.moveNext(); | |
2193 | - vPlusLine.moveNext(); | |
2194 | - } | |
2195 | - | |
2196 | -} | |
2197 | - | |
2198 | -void test_13(const Parameter& p) { | |
2199 | - | |
2200 | - BLUR_EXTRACT_PARAMS; | |
2201 | - | |
2202 | - uint32_t hRad = p.radius; | |
2203 | - uint32_t vRad = p.radius; | |
2204 | - uint32_t hLen = 1 + hRad*2; | |
2205 | - uint32_t vLen = 1 + vRad*2; | |
2206 | - uint32_t invLen = 0xFFFFFF / (hLen*vLen); | |
2207 | - uint32_t hCount = p.width; | |
2208 | - uint32_t vCount = p.height; | |
2209 | - | |
2210 | - static const __m128i mInvRatio = _mm_set1_epi16(0xFFFF / 9); | |
2211 | - | |
2212 | - if (vRad != 1) { | |
2213 | - return; | |
2214 | - } | |
2215 | - | |
2216 | - const uint8_t* hLine = p.pSrc; | |
2217 | - uint8_t* vLine = p.pDest; | |
2218 | - OffsetPtr(vLine, destLineOffsetBytes * vRad); | |
2219 | - | |
2220 | - uint16_t* vSumLine = (uint16_t*)pWork2; | |
2221 | - assert((ptrdiff_t)vSumLine % 16 == 0); | |
2222 | - assert((width * 2) % 16 == 0); | |
2223 | - | |
2224 | - int* remains = (int*)p.pWork; | |
2225 | -#if 1 | |
2226 | - const __m128i* mpSrc = (const __m128i*)pSrc; | |
2227 | - __m128i* mpDst = (__m128i*)pDest; | |
2228 | - for (size_t i=0; i<hCount/64; ++i) { | |
2229 | - const __m128i* src = mpSrc; | |
2230 | - __m128i* dst = mpDst; | |
2231 | - __m128i sums0 = _mm_setzero_si128(); | |
2232 | - __m128i sums1 = _mm_setzero_si128(); | |
2233 | - __m128i sums2 = _mm_setzero_si128(); | |
2234 | - __m128i sums3 = _mm_setzero_si128(); | |
2235 | - __m128i sums4 = _mm_setzero_si128(); | |
2236 | - __m128i sums5 = _mm_setzero_si128(); | |
2237 | - __m128i sums6 = _mm_setzero_si128(); | |
2238 | - __m128i sums7 = _mm_setzero_si128(); | |
2239 | - __m128i adds0 = _mm_setzero_si128(); | |
2240 | - __m128i adds1 = _mm_setzero_si128(); | |
2241 | - __m128i adds2 = _mm_setzero_si128(); | |
2242 | - __m128i adds3 = _mm_setzero_si128(); | |
2243 | - __m128i adds4 = _mm_setzero_si128(); | |
2244 | - __m128i adds5 = _mm_setzero_si128(); | |
2245 | - __m128i adds6 = _mm_setzero_si128(); | |
2246 | - __m128i adds7 = _mm_setzero_si128(); | |
2247 | - __m128i mids0 = _mm_setzero_si128(); | |
2248 | - __m128i mids1 = _mm_setzero_si128(); | |
2249 | - __m128i mids2 = _mm_setzero_si128(); | |
2250 | - __m128i mids3 = _mm_setzero_si128(); | |
2251 | - __m128i mids4 = _mm_setzero_si128(); | |
2252 | - __m128i mids5 = _mm_setzero_si128(); | |
2253 | - __m128i mids6 = _mm_setzero_si128(); | |
2254 | - __m128i mids7 = _mm_setzero_si128(); | |
2255 | - __m128i remain0 = _mm_setzero_si128(); | |
2256 | - __m128i remain1 = _mm_setzero_si128(); | |
2257 | - __m128i remain2 = _mm_setzero_si128(); | |
2258 | - __m128i nsrc0 = src[0]; | |
2259 | - __m128i nsrc1 = src[1]; | |
2260 | - __m128i nsrc2 = src[2]; | |
2261 | - __m128i nsrc3 = src[3]; | |
2262 | - for (size_t y=0; y<vCount; ++y) { | |
2263 | - sums0 = _mm_sub_epi16(sums0, mids0); | |
2264 | - sums1 = _mm_sub_epi16(sums1, mids1); | |
2265 | - sums2 = _mm_sub_epi16(sums2, mids2); | |
2266 | - sums3 = _mm_sub_epi16(sums3, mids3); | |
2267 | - sums4 = _mm_sub_epi16(sums4, mids4); | |
2268 | - sums5 = _mm_sub_epi16(sums5, mids5); | |
2269 | - sums6 = _mm_sub_epi16(sums6, mids6); | |
2270 | - sums7 = _mm_sub_epi16(sums7, mids7); | |
2271 | - | |
2272 | - mids0 = adds0; | |
2273 | - mids1 = adds1; | |
2274 | - mids2 = adds2; | |
2275 | - mids3 = adds3; | |
2276 | - mids4 = adds4; | |
2277 | - mids5 = adds5; | |
2278 | - mids6 = adds6; | |
2279 | - mids7 = adds7; | |
2280 | - | |
2281 | - __m128i src0 = nsrc0; | |
2282 | - __m128i src1 = nsrc1; | |
2283 | - __m128i src2 = nsrc2; | |
2284 | - __m128i src3 = nsrc3; | |
2285 | - nsrc0 = src[4]; | |
2286 | - nsrc1 = src[5]; | |
2287 | - nsrc2 = src[6]; | |
2288 | - nsrc3 = src[7]; | |
2289 | - | |
2290 | - __m128i remain = _mm_cvtsi32_si128(remains[y]); | |
2291 | - repeatShiftSum3(src0, adds0, adds1, remain0); | |
2292 | - adds0 = _mm_add_epi16(adds0, remain); | |
2293 | - repeatShiftSum3(src1, adds2, adds3, remain1); | |
2294 | - repeatShiftSum3(src2, adds4, adds5, remain2); | |
2295 | - repeatShiftSum3(src3, adds6, adds7, remain); | |
2296 | - remains[y] = _mm_cvtsi128_si32(remain); | |
2297 | - adds2 = _mm_add_epi16(adds2, remain0); | |
2298 | - adds4 = _mm_add_epi16(adds4, remain1); | |
2299 | - adds6 = _mm_add_epi16(adds6, remain2); | |
2300 | - | |
2301 | - sums0 = _mm_add_epi16(sums0, adds0); | |
2302 | - sums1 = _mm_add_epi16(sums1, adds1); | |
2303 | - sums2 = _mm_add_epi16(sums2, adds2); | |
2304 | - sums3 = _mm_add_epi16(sums3, adds3); | |
2305 | - sums4 = _mm_add_epi16(sums4, adds4); | |
2306 | - sums5 = _mm_add_epi16(sums5, adds5); | |
2307 | - sums6 = _mm_add_epi16(sums6, adds6); | |
2308 | - sums7 = _mm_add_epi16(sums7, adds7); | |
2309 | - | |
2310 | - __m128i result0 = _mm_packus_epi16(_mm_mulhi_epu16(sums0, mInvRatio), _mm_mulhi_epu16(sums1, mInvRatio)); | |
2311 | - __m128i result1 = _mm_packus_epi16(_mm_mulhi_epu16(sums2, mInvRatio), _mm_mulhi_epu16(sums3, mInvRatio)); | |
2312 | - __m128i result2 = _mm_packus_epi16(_mm_mulhi_epu16(sums4, mInvRatio), _mm_mulhi_epu16(sums5, mInvRatio)); | |
2313 | - __m128i result3 = _mm_packus_epi16(_mm_mulhi_epu16(sums6, mInvRatio), _mm_mulhi_epu16(sums7, mInvRatio)); | |
2314 | - _mm_stream_si128(dst+0, result0); | |
2315 | - _mm_stream_si128(dst+1, result1); | |
2316 | - _mm_stream_si128(dst+2, result2); | |
2317 | - _mm_stream_si128(dst+3, result3); | |
2318 | - OffsetPtr(dst, destLineOffsetBytes); | |
2319 | - OffsetPtr(src, destLineOffsetBytes); | |
2320 | - } | |
2321 | - mpSrc += 4; | |
2322 | - mpDst += 4; | |
2323 | - } | |
2324 | -#else | |
2325 | - memcpy(pDest, pSrc, vCount*hCount); | |
2326 | - //__m256i* __restrict mpSrc = (__m256i* __restrict)&pSrc[0]; | |
2327 | - //__m256i* __restrict mpDst = (__m256i* __restrict)&pDest[0]; | |
2328 | - //for (size_t i=0; i<vCount*hCount/64; ++i) { | |
2329 | - // _mm256_stream_si256(&mpDst[i*2+0], _mm256_load_si256(&mpSrc[i*2+0])); | |
2330 | - // _mm256_stream_si256(&mpDst[i*2+1], _mm256_load_si256(&mpSrc[i*2+1])); | |
2331 | - //} | |
2332 | -#endif | |
2333 | - | |
2334 | -} | |
2335 | - | |
2336 | -void test_14(const Parameter& p) { | |
2337 | -#if _MSC_VER >= 1700 | |
2338 | - | |
2339 | - BLUR_EXTRACT_PARAMS; | |
2340 | - | |
2341 | - uint32_t hRad = p.radius; | |
2342 | - uint32_t vRad = p.radius; | |
2343 | - uint32_t hLen = 1 + hRad*2; | |
2344 | - uint32_t vLen = 1 + vRad*2; | |
2345 | - uint32_t invLen = 0xFFFFFF / (hLen*vLen); | |
2346 | - uint32_t hCount = p.width; | |
2347 | - uint32_t vCount = p.height; | |
2348 | - | |
2349 | - _mm256_zeroall(); | |
2350 | - | |
2351 | - static const __m256i mInvRatio = _mm256_set1_epi16(0xFFFF / 9); | |
2352 | - | |
2353 | - if (vRad != 1) { | |
2354 | - return; | |
2355 | - } | |
2356 | - | |
2357 | - const uint8_t* hLine = p.pSrc; | |
2358 | - uint8_t* vLine = p.pDest; | |
2359 | - OffsetPtr(vLine, destLineOffsetBytes * vRad); | |
2360 | - | |
2361 | - uint16_t* vSumLine = (uint16_t*)pWork2; | |
2362 | - assert((ptrdiff_t)vSumLine % 16 == 0); | |
2363 | - assert((width * 2) % 16 == 0); | |
2364 | - | |
2365 | - int* remains = (int*)p.pWork; | |
2366 | - const __m256i* mpSrc = (const __m256i*)pSrc; | |
2367 | - __m256i* mpDst = (__m256i*)pDest; | |
2368 | - for (size_t i=0; i<width/64; ++i) { | |
2369 | - const __m256i* src = mpSrc; | |
2370 | - __m256i* dst = mpDst; | |
2371 | - __m256i sums0 = _mm256_setzero_si256(); | |
2372 | - __m256i sums1 = _mm256_setzero_si256(); | |
2373 | - __m256i sums2 = _mm256_setzero_si256(); | |
2374 | - __m256i sums3 = _mm256_setzero_si256(); | |
2375 | - __m256i adds0 = _mm256_setzero_si256(); | |
2376 | - __m256i adds1 = _mm256_setzero_si256(); | |
2377 | - __m256i adds2 = _mm256_setzero_si256(); | |
2378 | - __m256i adds3 = _mm256_setzero_si256(); | |
2379 | - __m256i mids0 = _mm256_setzero_si256(); | |
2380 | - __m256i mids1 = _mm256_setzero_si256(); | |
2381 | - __m256i mids2 = _mm256_setzero_si256(); | |
2382 | - __m256i mids3 = _mm256_setzero_si256(); | |
2383 | - __m256i remain0 = _mm256_setzero_si256(); | |
2384 | - __m256i remain1 = _mm256_setzero_si256(); | |
2385 | - __m256i remain2 = _mm256_setzero_si256(); | |
2386 | - __m256i nsrc0 = src[0]; | |
2387 | - __m256i nsrc1 = src[1]; | |
2388 | - for (size_t y=0; y<vCount; ++y) { | |
2389 | - sums0 = _mm256_sub_epi16(sums0, mids0); | |
2390 | - sums1 = _mm256_sub_epi16(sums1, mids1); | |
2391 | - sums2 = _mm256_sub_epi16(sums2, mids2); | |
2392 | - sums3 = _mm256_sub_epi16(sums3, mids3); | |
2393 | - | |
2394 | - mids0 = adds0; | |
2395 | - mids1 = adds1; | |
2396 | - mids2 = adds2; | |
2397 | - mids3 = adds3; | |
2398 | - | |
2399 | - __m256i src0 = nsrc0; | |
2400 | - __m256i src1 = nsrc1; | |
2401 | - nsrc0 = src[2]; | |
2402 | - nsrc1 = src[3]; | |
2403 | - | |
2404 | - __m256i remain = _mm256_castsi128_si256(_mm_cvtsi32_si128(remains[y])); | |
2405 | - repeatShiftSum3(src0, adds0, adds1, remain0); | |
2406 | - adds0 = _mm256_add_epi16(adds0, remain); | |
2407 | - repeatShiftSum3(src1, adds2, adds3, remain1); | |
2408 | - remains[y] = _mm_cvtsi128_si32(_mm256_castsi256_si128(remain)); | |
2409 | - adds2 = _mm256_add_epi16(adds2, remain0); | |
2410 | - | |
2411 | - sums0 = _mm256_add_epi16(sums0, adds0); | |
2412 | - sums1 = _mm256_add_epi16(sums1, adds1); | |
2413 | - sums2 = _mm256_add_epi16(sums2, adds2); | |
2414 | - sums3 = _mm256_add_epi16(sums3, adds3); | |
2415 | - | |
2416 | - __m256i result0 = _mm256_packus_epi16(_mm256_mulhi_epu16(sums0, mInvRatio), _mm256_mulhi_epu16(sums1, mInvRatio)); | |
2417 | - __m256i result1 = _mm256_packus_epi16(_mm256_mulhi_epu16(sums2, mInvRatio), _mm256_mulhi_epu16(sums3, mInvRatio)); | |
2418 | -#if 1 | |
2419 | - _mm256_stream_si256(dst+0, result0); | |
2420 | - _mm256_stream_si256(dst+1, result1); | |
2421 | -#else | |
2422 | - dst[0] = result0; | |
2423 | - dst[1] = result1; | |
2424 | -#endif | |
2425 | - OffsetPtr(dst, destLineOffsetBytes); | |
2426 | - OffsetPtr(src, destLineOffsetBytes); | |
2427 | - } | |
2428 | - mpSrc += 2; | |
2429 | - mpDst += 2; | |
2430 | - } | |
2431 | -#endif | |
2432 | -} | |
2433 | - | |
2434 | 1943 | __forceinline |
2435 | 1944 | __m128i shiftAdd16(__m128i v) |
2436 | 1945 | { |
@@ -2444,181 +1953,6 @@ | ||
2444 | 1953 | return v; |
2445 | 1954 | } |
2446 | 1955 | |
2447 | -__forceinline | |
2448 | -void hProcess(const __m128i* src, __m128i* dst, size_t width, uint8_t len) | |
2449 | -{ | |
2450 | - // 現在の実装には制限有り | |
2451 | - if (len > 7 || len == 0) { | |
2452 | - return; | |
2453 | - } | |
2454 | - size_t vcnt = width / 16; | |
2455 | - if (vcnt == 0) { | |
2456 | - return; | |
2457 | - } | |
2458 | - // 左側画面外の要素を反転して生成 | |
2459 | - __m128i prev; | |
2460 | - __m128i cur; | |
2461 | - __m128i sum0 = _mm_setzero_si128(); | |
2462 | - | |
2463 | - cur = src[0]; // 0-15 | |
2464 | - const __m128i REVERSE = _mm_setr_epi8(-1,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1); | |
2465 | - prev = _mm_shuffle_epi8(cur, REVERSE); // 15-1 | |
2466 | - | |
2467 | - // shift masks | |
2468 | - static const __m128i M[7*2] = { | |
2469 | - // 1 | |
2470 | - {13,14,15,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,}, // prev | |
2471 | - {-1,-1,-1,+0,+1,+2,+3,+4,+5,+6,+7,+8,+9,10,11,12,}, // cur | |
2472 | - // 2 | |
2473 | - {11,12,13,14,15,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,}, // prev | |
2474 | - {-1,-1,-1,-1,-1,+0,+1,+2,+3,+4,+5,+6,+7,+8,+9,10,}, // cur | |
2475 | - // 3 | |
2476 | - {+9,10,11,12,13,14,15,-1,-1,-1,-1,-1,-1,-1,-1,-1,}, // prev | |
2477 | - {-1,-1,-1,-1,-1,-1,-1,+0,+1,+2,+3,+4,+5,+6,+7,+8,}, // cur | |
2478 | - // 4 | |
2479 | - {+7,+8,+9,10,11,12,13,14,15,-1,-1,-1,-1,-1,-1,-1,}, // prev | |
2480 | - {-1,-1,-1,-1,-1,-1,-1,-1,-1,+0,+1,+2,+3,+4,+5,+6,}, // cur | |
2481 | - // 5 | |
2482 | - {+5,+6,+7,+8,+9,10,11,12,13,14,15,-1,-1,-1,-1,-1,}, // prev | |
2483 | - {-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,+0,+1,+2,+3,+4,}, // cur | |
2484 | - // 6 | |
2485 | - {+3,+4,+5,+6,+7,+8,+9,10,11,12,13,14,15,-1,-1,-1,}, // prev | |
2486 | - {-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,+0,+1,+2,}, // cur | |
2487 | - // 7 | |
2488 | - {+1,+2,+3,+4,+5,+6,+7,+8,+9,10,11,12,13,14,15,-1,}, // prev | |
2489 | - {-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,+0,}, // cur | |
2490 | - }; | |
2491 | - const size_t baseIdx = (len - 1) * 2; | |
2492 | - const __m128i prevMask = M[baseIdx+0]; | |
2493 | - const __m128i curMask = M[baseIdx+1]; | |
2494 | - | |
2495 | - const __m128i invLen = _mm_set1_epi16(0xFFFF / (1+len*2)); | |
2496 | - const __m128i mask7 = { | |
2497 | - 14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15, | |
2498 | - }; | |
2499 | - | |
2500 | - uint16_t sum = 0; | |
2501 | - for (size_t i=0; i<1+len*2; ++i) { | |
2502 | - sum += ((const uint8_t*)src)[1+i]; | |
2503 | - } | |
2504 | - sum0 = _mm_set1_epi16(sum); | |
2505 | - | |
2506 | - for (size_t i=0; i<vcnt; ++i) { | |
2507 | - __m128i next = src[i+1]; | |
2508 | - __m128i prev2 = _mm_shuffle_epi8(prev, prevMask); | |
2509 | - __m128i cur2 = _mm_shuffle_epi8(cur, curMask); | |
2510 | - __m128i minus = _mm_or_si128(prev2, cur2); | |
2511 | - | |
2512 | - __m128i plus0 = _mm_unpacklo_epi8(cur, _mm_setzero_si128()); | |
2513 | - __m128i plus1 = _mm_unpackhi_epi8(cur, _mm_setzero_si128()); | |
2514 | - __m128i minus0 = _mm_unpacklo_epi8(minus, _mm_setzero_si128()); | |
2515 | - __m128i minus1 = _mm_unpackhi_epi8(minus, _mm_setzero_si128()); | |
2516 | - __m128i diff0 = _mm_sub_epi16(plus0, minus0); | |
2517 | - __m128i diff1 = _mm_sub_epi16(plus1, minus1); | |
2518 | - | |
2519 | - diff0 = shiftAdd16(diff0); | |
2520 | - sum0 = _mm_add_epi16(sum0, diff0); | |
2521 | - | |
2522 | - __m128i sum1 = _mm_shuffle_epi8(sum0, mask7); | |
2523 | - diff1 = shiftAdd16(diff1); | |
2524 | - sum1 = _mm_add_epi16(sum1, diff1); | |
2525 | - | |
2526 | - __m128i bytes = _mm_packus_epi16(_mm_mulhi_epu16(sum0, invLen), _mm_mulhi_epu16(sum1, invLen)); | |
2527 | -// _mm_stream_si128(dst+i, bytes); | |
2528 | - dst[i] = bytes; | |
2529 | -// dst[i] = src[i]; | |
2530 | - prev = cur; | |
2531 | - cur = next; | |
2532 | - sum0 = _mm_shuffle_epi8(sum1, mask7); | |
2533 | - } | |
2534 | - | |
2535 | -} | |
2536 | - | |
2537 | -/* | |
2538 | -#if 0 | |
2539 | - __m128i sum0 = _mm_shufflehi_epi16(sum1, _MM_SHUFFLE(3,3,3,3)); | |
2540 | - sum0 = _mm_unpackhi_epi64(sum0, sum0); | |
2541 | -#else | |
2542 | - __m128i sum0 = _mm_shuffle_epi8(sum1, MASK7); | |
2543 | -#endif | |
2544 | - | |
2545 | -*/ | |
2546 | - | |
2547 | -__forceinline | |
2548 | -void hProcess2(const __m128i* src, __m128i* dst, size_t width, uint8_t len) | |
2549 | -{ | |
2550 | - // 現在の実装には制限有り | |
2551 | - if (len > 14 || len == 0) { | |
2552 | - return; | |
2553 | - } | |
2554 | - size_t vcnt = width / 16; | |
2555 | - if (vcnt == 0) { | |
2556 | - return; | |
2557 | - } | |
2558 | - | |
2559 | - static const __m128i MASK7 = { | |
2560 | - 14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15, | |
2561 | - }; | |
2562 | - const __m128i REVERSE = _mm_setr_epi8(-1,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1); | |
2563 | - const __m128i invLen = _mm_set1_epi16(0xFFFF / (1+len*2)); | |
2564 | - | |
2565 | - // 反転して要素を生成 | |
2566 | - __m128i minusSrc2[2]; | |
2567 | - minusSrc2[0] = _mm_shuffle_epi8(src[0], REVERSE); // 15-1 | |
2568 | - minusSrc2[1] = src[0]; | |
2569 | - uint16_t sum = 0; | |
2570 | - for (size_t i=0; i<1+len*2; ++i) { | |
2571 | - sum += ((const uint8_t*)(minusSrc2+1)) [i - (len+1)]; | |
2572 | - } | |
2573 | - __m128i sum1 = _mm_set1_epi16(sum); | |
2574 | - const __m128i* plusSrc = (const __m128i*) ((const uint8_t*)src + len); | |
2575 | - const __m128i* minusSrc = (const __m128i*) ((const uint8_t*)src - (len+1)); | |
2576 | - __m128i plus = _mm_loadu_si128(plusSrc); | |
2577 | - __m128i minus = _mm_loadu_si128( | |
2578 | - (const __m128i*) ( | |
2579 | - (const uint8_t*)(minusSrc2+1) - (len+1) | |
2580 | - ) | |
2581 | - ); | |
2582 | - | |
2583 | - for (size_t i=0; i<vcnt; ++i) { | |
2584 | - __m128i nextPlus = _mm_loadu_si128(plusSrc+i+1); | |
2585 | - __m128i nextMinus = _mm_loadu_si128(minusSrc+i+1); | |
2586 | - | |
2587 | - __m128i sum0 = _mm_shuffle_epi8(sum1, MASK7); | |
2588 | - __m128i plus0 = _mm_unpacklo_epi8(plus, _mm_setzero_si128()); | |
2589 | - __m128i minus0 = _mm_unpacklo_epi8(minus, _mm_setzero_si128()); | |
2590 | - __m128i plus1 = _mm_unpackhi_epi8(plus, _mm_setzero_si128()); | |
2591 | - __m128i minus1 = _mm_unpackhi_epi8(minus, _mm_setzero_si128()); | |
2592 | - __m128i diff0 = _mm_sub_epi16(plus0, minus0); | |
2593 | - __m128i diff1 = _mm_sub_epi16(plus1, minus1); | |
2594 | - | |
2595 | - diff0 = shiftAdd16(diff0); | |
2596 | - sum0 = _mm_add_epi16(sum0, diff0); | |
2597 | - sum1 = _mm_shuffle_epi8(sum0, MASK7); | |
2598 | - diff1 = shiftAdd16(diff1); | |
2599 | - sum1 = _mm_add_epi16(sum1, diff1); | |
2600 | - | |
2601 | - __m128i bytes = _mm_packus_epi16(_mm_mulhi_epu16(sum0, invLen), _mm_mulhi_epu16(sum1, invLen)); | |
2602 | - dst[i] = bytes; | |
2603 | - | |
2604 | - plus = nextPlus; | |
2605 | - minus = nextMinus; | |
2606 | - | |
2607 | - } | |
2608 | - | |
2609 | -} | |
2610 | - | |
2611 | -void test_15(const Parameter& p) { | |
2612 | - | |
2613 | - const __m128i* pSrc = (const __m128i*) p.pSrc; | |
2614 | - __m128i* pDst = (__m128i*) p.pDest; | |
2615 | - for (size_t y=0; y<p.height; ++y) { | |
2616 | - hProcess2(pSrc, pDst, p.width, p.radius); | |
2617 | - OffsetPtr(pSrc, p.srcLineOffsetBytes); | |
2618 | - OffsetPtr(pDst, p.destLineOffsetBytes); | |
2619 | - } | |
2620 | -} | |
2621 | - | |
2622 | 1956 | static const __m128i REVERSE = _mm_setr_epi8(-1,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1); |
2623 | 1957 | |
2624 | 1958 | static const __m128i MASK7 = { |
@@ -2843,7 +2177,7 @@ | ||
2843 | 2177 | } |
2844 | 2178 | }; |
2845 | 2179 | |
2846 | -void test_16(const Parameter& p) { | |
2180 | +void test_12(const Parameter& p) { | |
2847 | 2181 | |
2848 | 2182 | size_t len = p.radius; |
2849 | 2183 | size_t width = p.width; |
@@ -47,11 +47,7 @@ | ||
47 | 47 | void test_9(const Parameter& p); // memory access further optimization |
48 | 48 | void test_10(const Parameter& p); // test_9 SSE optimization |
49 | 49 | void test_11(const Parameter& p); // fused horizontal & vertical |
50 | -void test_12(const Parameter& p); // fused horizontal to vertical SSE2 | |
51 | -void test_13(const Parameter& p); // fused vertical to horizontal SSE2 | |
52 | -void test_14(const Parameter& p); // fused vertical to horizontal AVX2 | |
53 | -void test_15(const Parameter& p); // horizontal SSSE3 | |
54 | -void test_16(const Parameter& p); // horizontal vertical SSSE3 | |
50 | +void test_12(const Parameter& p); // test_11 SSE3 optimization | |
55 | 51 | |
56 | 52 | // TentFilter |
57 | 53 | void test_20(const Parameter& p); // C implementation |