• R/O
  • SSH
  • HTTPS

imagefilter: Commit


Commit MetaInfo

Révision29 (tree)
l'heure2013-08-04 00:47:46
Auteurberupon

Message de Log

removed some implementations

Change Summary

Modification

--- trunk/main.cpp (revision 28)
+++ trunk/main.cpp (revision 29)
@@ -57,8 +57,8 @@
5757 // const size_t nThreads = 2;
5858 // const size_t nThreads = 4;
5959 #else
60-// const size_t nThreads = si.dwNumberOfProcessors;
61- const size_t nThreads = 1;
60+ const size_t nThreads = si.dwNumberOfProcessors;
61+// const size_t nThreads = 1;
6262 #endif
6363 Threads<blur_1b::Parameter> threads;
6464 threads.SetUp(nThreads);
@@ -106,10 +106,6 @@
106106 blur_1b::test_10,
107107 blur_1b::test_11,
108108 blur_1b::test_12,
109- blur_1b::test_13,
110- blur_1b::test_14,
111- blur_1b::test_15,
112- blur_1b::test_16,
113109
114110 //blur_1b::test_20,
115111 //blur_1b::test_21,
--- trunk/blur_1b.cpp (revision 28)
+++ trunk/blur_1b.cpp (revision 29)
@@ -1940,497 +1940,6 @@
19401940
19411941 }
19421942
1943-static __forceinline
1944-void
1945-repeatShiftSum3(__m128i m01, __m128i& m0, __m128i& m1, __m128i& remain0)
1946-{
1947- m0 = _mm_unpacklo_epi8(m01, _mm_setzero_si128());
1948- m1 = _mm_unpackhi_epi8(m01, _mm_setzero_si128());
1949-#if 1
1950- remain0 = _mm_srli_si128(_mm_add_epi16(m1, _mm_srli_si128(m1, 2)), 12);
1951-
1952- __m128i m01_1 = _mm_slli_si128(m01, 1);
1953- __m128i m01_2 = _mm_slli_si128(m01, 2);
1954- m0 =
1955- _mm_add_epi16(
1956- _mm_add_epi16(
1957- m0,
1958- _mm_unpacklo_epi8(m01_1, _mm_setzero_si128())
1959- ),
1960- _mm_unpacklo_epi8(m01_2, _mm_setzero_si128())
1961- );
1962- m1 =
1963- _mm_add_epi16(
1964- _mm_add_epi16(
1965- m1,
1966- _mm_unpackhi_epi8(m01_1, _mm_setzero_si128())
1967- ),
1968- _mm_unpackhi_epi8(m01_2, _mm_setzero_si128())
1969- );
1970-#else
1971- __m128i s0L = _mm_add_epi16(_mm_add_epi16(m0, _mm_slli_si128(m0, 2)), _mm_slli_si128(m0, 4));
1972- __m128i s1L = _mm_add_epi16(_mm_add_epi16(m1, _mm_slli_si128(m1, 2)), _mm_slli_si128(m1, 4));
1973- __m128i s0R = _mm_srli_si128(_mm_add_epi16(m0, _mm_srli_si128(m0, 2)), 12);
1974- __m128i s1R = _mm_srli_si128(_mm_add_epi16(m1, _mm_srli_si128(m1, 2)), 12);
1975-
1976- m0 = s0L;
1977- m1 = _mm_add_epi16(s1L, s0R);
1978- remain0 = s1R;
1979-#endif
1980-}
1981-
1982-#if _MSC_VER >= 1700
1983-
1984-static __forceinline
1985-void repeatShiftSum3(__m256i m01, __m256i& m0, __m256i& m1, __m256i& remain0)
1986-{
1987- m0 = _mm256_unpacklo_epi8(m01, _mm256_setzero_si256());
1988- m1 = _mm256_unpackhi_epi8(m01, _mm256_setzero_si256());
1989- remain0 = _mm256_srli_si256(_mm256_add_epi16(m1, _mm256_srli_si256(m1, 2)), 28);
1990-
1991- __m256i m01_1 = _mm256_slli_si256(m01, 1);
1992- __m256i m01_2 = _mm256_slli_si256(m01, 2);
1993- m0 =
1994- _mm256_add_epi16(
1995- _mm256_add_epi16(
1996- m0,
1997- _mm256_unpacklo_epi8(m01_1, _mm256_setzero_si256())
1998- ),
1999- _mm256_unpacklo_epi8(m01_2, _mm256_setzero_si256())
2000- );
2001- m1 =
2002- _mm256_add_epi16(
2003- _mm256_add_epi16(
2004- m1,
2005- _mm256_unpackhi_epi8(m01_1, _mm256_setzero_si256())
2006- ),
2007- _mm256_unpackhi_epi8(m01_2, _mm256_setzero_si256())
2008- );
2009-
2010-}
2011-
2012-#endif
2013-
2014-template <size_t SHIFTS>
2015-static __forceinline
2016-void repeatShiftSum(__m128i main01, __m128i& main0, __m128i& main1, __m128i& remain0)
2017-{
2018-
2019- // TODO: efficiently construct slided sum
2020- // 2 ^ 2 ^ 2 = 8
2021- main0 = _mm_unpacklo_epi8(main01, _mm_setzero_si128());
2022- main1 = _mm_unpackhi_epi8(main01, _mm_setzero_si128());
2023-
2024- __m128i m0L = main0;
2025- __m128i m0R = main0;
2026- __m128i m1L = main1;
2027- __m128i m1R = main1;
2028-
2029- __m128i s0L = main0;
2030- __m128i s0R = main0;
2031- __m128i s1L = main1;
2032- __m128i s1R = main1;
2033-
2034- for (size_t i=0; i<SHIFTS-1; ++i) {
2035- m0L = _mm_slli_si128(m0L, 2);
2036- m0R = _mm_srli_si128(m0R, 2);
2037- m1L = _mm_slli_si128(m1L, 2);
2038- m1R = _mm_srli_si128(m1R, 2);
2039- s0L = _mm_add_epi16(s0L, m0L);
2040- s0R = _mm_add_epi16(s0R, m0R);
2041- s1L = _mm_add_epi16(s1L, m1L);
2042- s1R = _mm_add_epi16(s1R, m1R);
2043- }
2044-
2045- main0 = s0L;
2046- s0R = _mm_srli_si128(s0R, (8-(SHIFTS-1))*2);
2047- main1 = _mm_add_epi16(s1L, s0R);
2048- remain0 = _mm_srli_si128(s1R, (8-(SHIFTS-1))*2);;
2049-}
2050-
2051-static __forceinline
2052-void repeatShiftNum(__m128i main01, __m128i& main0, __m128i& main1, __m128i& remain0, size_t count)
2053-{
2054- switch (count) {
2055- case 3:
2056- repeatShiftSum3(main01, main0, main1, remain0);
2057- break;
2058- case 5:
2059- repeatShiftSum<5>(main01, main0, main1, remain0);
2060- break;
2061- case 7:
2062- repeatShiftSum<7>(main01, main0, main1, remain0);
2063- break;
2064- case 9:
2065- repeatShiftSum<9>(main01, main0, main1, remain0);
2066- break;
2067- default:
2068- break;
2069- }
2070-}
2071-
2072-void test_12(const Parameter& p) {
2073-
2074- BLUR_EXTRACT_PARAMS;
2075-
2076- uint32_t hRad = p.radius;
2077- uint32_t vRad = p.radius;
2078- uint32_t hLen = 1 + hRad*2;
2079- uint32_t vLen = 1 + vRad*2;
2080- uint32_t invLen = 0xFFFFFF / (hLen*vLen);
2081- uint32_t hCount = p.width;
2082- uint32_t vCount = p.height;
2083-
2084- const __m128i mInvRatio = _mm_set1_epi16(0xFFFF / (hLen*vLen));
2085-
2086- if (hLen > 9) {
2087- return;
2088- }
2089-
2090- const uint8_t* hLine = p.pSrc;
2091- uint8_t* vLine = p.pDest;
2092- OffsetPtr(vLine, destLineOffsetBytes * vRad);
2093-
2094- uint16_t* vSumLine = (uint16_t*)pWork2;
2095- assert((ptrdiff_t)vSumLine % 16 == 0);
2096- assert((width * 2) % 16 == 0);
2097-
2098- RingLinePtr<uint16_t*> vMinusLine(vLen, 0, (uint16_t*)pWork, width*2);
2099- RingLinePtr<uint16_t*> vPlusLine(vLen, 0, (uint16_t*)pWork, width*2);
2100-
2101- // vTop collect
2102- {
2103- const uint8_t* hMinus = hLine;
2104- const uint8_t* hPlus = hLine+hLen;
2105- size_t hSum = 0;
2106- // hLeft collect
2107- for (size_t x=0; x<hLen; ++x) {
2108- hSum += hLine[x];
2109- }
2110- // hCenter
2111- for (size_t x=hRad; x<hCount-hRad; ++x) {
2112- hSum -= *hMinus++;
2113- hSum += *hPlus++;
2114- vPlusLine[x] = hSum;
2115- vSumLine[x] = hSum;
2116- }
2117- // hRight
2118- ;
2119- OffsetPtr(hLine, srcLineOffsetBytes);
2120- vPlusLine.moveNext();
2121- }
2122- for (size_t y=1; y<vLen; ++y) {
2123- const uint8_t* hMinus = hLine;
2124- const uint8_t* hPlus = hLine+hLen;
2125- size_t hSum = 0;
2126- // hLeft collect
2127- for (size_t x=0; x<hLen; ++x) {
2128- hSum += hLine[x];
2129- }
2130- // hCenter
2131- for (size_t x=hRad; x<hCount-hRad; ++x) {
2132- hSum -= *hMinus++;
2133- hSum += *hPlus++;
2134- vPlusLine[x] = hSum;
2135- vSumLine[x] += hSum;
2136- }
2137- // hRight
2138- ;
2139- OffsetPtr(hLine, srcLineOffsetBytes);
2140- vPlusLine.moveNext();
2141- }
2142-
2143- __m128i* mvSumLine = (__m128i*)vSumLine;
2144-
2145- // vMiddle
2146- for (size_t y=vRad; y<vCount-vLen; ++y) {
2147-
2148- assert((ptrdiff_t)hLine % 16 == 0);
2149-
2150- const __m128i* mhLine = (const __m128i*)hLine;
2151- __m128i m01 = mhLine[0];
2152- __m128i m0 = _mm_unpacklo_epi8(m01, _mm_setzero_si128());
2153- __m128i m1 = _mm_unpackhi_epi8(m01, _mm_setzero_si128());
2154- __m128i m23 = mhLine[1];
2155-
2156- // hLeft collect
2157- __m128i m1r;
2158-
2159- __m128i* mvLine = (__m128i*) vLine;
2160- __m128i* mvMinusLine = vMinusLine;
2161- __m128i* mvPlusLine = vMinusLine;
2162- __m128i m0l = _mm_setzero_si128();
2163-
2164- // hCenter
2165- const size_t loopCount = hCount / 16;
2166- for (size_t i=0; i<loopCount; ++i) {
2167- repeatShiftNum(mhLine[i], m0, m1, m1r, hLen);
2168- m0 = _mm_add_epi16(m0l, m0);
2169- __m128i sum0 = mvSumLine[i*2+0];
2170- __m128i sum1 = mvSumLine[i*2+1];
2171- __m128i minus0 = mvMinusLine[i*2+0];
2172- __m128i minus1 = mvMinusLine[i*2+1];
2173- sum0 = _mm_sub_epi16(sum0, minus0);
2174- sum1 = _mm_sub_epi16(sum1, minus1);
2175- mvPlusLine[i*2+0] = m0;
2176- mvPlusLine[i*2+1] = m1;
2177- sum0 = _mm_add_epi16(sum0, m0);
2178- sum1 = _mm_add_epi16(sum1, m1);
2179- mvSumLine[i*2+0] = sum0;
2180- mvSumLine[i*2+1] = sum1;
2181-
2182- _mm_stream_si128(mvLine+i, _mm_packus_epi16(
2183- _mm_mulhi_epu16(sum0, mInvRatio),
2184- _mm_mulhi_epu16(sum1, mInvRatio)
2185- ));
2186- m0l = m1r;
2187- }
2188- // hRight
2189- ;
2190- OffsetPtr(hLine, srcLineOffsetBytes);
2191- OffsetPtr(vLine, destLineOffsetBytes);
2192- vMinusLine.moveNext();
2193- vPlusLine.moveNext();
2194- }
2195-
2196-}
2197-
2198-void test_13(const Parameter& p) {
2199-
2200- BLUR_EXTRACT_PARAMS;
2201-
2202- uint32_t hRad = p.radius;
2203- uint32_t vRad = p.radius;
2204- uint32_t hLen = 1 + hRad*2;
2205- uint32_t vLen = 1 + vRad*2;
2206- uint32_t invLen = 0xFFFFFF / (hLen*vLen);
2207- uint32_t hCount = p.width;
2208- uint32_t vCount = p.height;
2209-
2210- static const __m128i mInvRatio = _mm_set1_epi16(0xFFFF / 9);
2211-
2212- if (vRad != 1) {
2213- return;
2214- }
2215-
2216- const uint8_t* hLine = p.pSrc;
2217- uint8_t* vLine = p.pDest;
2218- OffsetPtr(vLine, destLineOffsetBytes * vRad);
2219-
2220- uint16_t* vSumLine = (uint16_t*)pWork2;
2221- assert((ptrdiff_t)vSumLine % 16 == 0);
2222- assert((width * 2) % 16 == 0);
2223-
2224- int* remains = (int*)p.pWork;
2225-#if 1
2226- const __m128i* mpSrc = (const __m128i*)pSrc;
2227- __m128i* mpDst = (__m128i*)pDest;
2228- for (size_t i=0; i<hCount/64; ++i) {
2229- const __m128i* src = mpSrc;
2230- __m128i* dst = mpDst;
2231- __m128i sums0 = _mm_setzero_si128();
2232- __m128i sums1 = _mm_setzero_si128();
2233- __m128i sums2 = _mm_setzero_si128();
2234- __m128i sums3 = _mm_setzero_si128();
2235- __m128i sums4 = _mm_setzero_si128();
2236- __m128i sums5 = _mm_setzero_si128();
2237- __m128i sums6 = _mm_setzero_si128();
2238- __m128i sums7 = _mm_setzero_si128();
2239- __m128i adds0 = _mm_setzero_si128();
2240- __m128i adds1 = _mm_setzero_si128();
2241- __m128i adds2 = _mm_setzero_si128();
2242- __m128i adds3 = _mm_setzero_si128();
2243- __m128i adds4 = _mm_setzero_si128();
2244- __m128i adds5 = _mm_setzero_si128();
2245- __m128i adds6 = _mm_setzero_si128();
2246- __m128i adds7 = _mm_setzero_si128();
2247- __m128i mids0 = _mm_setzero_si128();
2248- __m128i mids1 = _mm_setzero_si128();
2249- __m128i mids2 = _mm_setzero_si128();
2250- __m128i mids3 = _mm_setzero_si128();
2251- __m128i mids4 = _mm_setzero_si128();
2252- __m128i mids5 = _mm_setzero_si128();
2253- __m128i mids6 = _mm_setzero_si128();
2254- __m128i mids7 = _mm_setzero_si128();
2255- __m128i remain0 = _mm_setzero_si128();
2256- __m128i remain1 = _mm_setzero_si128();
2257- __m128i remain2 = _mm_setzero_si128();
2258- __m128i nsrc0 = src[0];
2259- __m128i nsrc1 = src[1];
2260- __m128i nsrc2 = src[2];
2261- __m128i nsrc3 = src[3];
2262- for (size_t y=0; y<vCount; ++y) {
2263- sums0 = _mm_sub_epi16(sums0, mids0);
2264- sums1 = _mm_sub_epi16(sums1, mids1);
2265- sums2 = _mm_sub_epi16(sums2, mids2);
2266- sums3 = _mm_sub_epi16(sums3, mids3);
2267- sums4 = _mm_sub_epi16(sums4, mids4);
2268- sums5 = _mm_sub_epi16(sums5, mids5);
2269- sums6 = _mm_sub_epi16(sums6, mids6);
2270- sums7 = _mm_sub_epi16(sums7, mids7);
2271-
2272- mids0 = adds0;
2273- mids1 = adds1;
2274- mids2 = adds2;
2275- mids3 = adds3;
2276- mids4 = adds4;
2277- mids5 = adds5;
2278- mids6 = adds6;
2279- mids7 = adds7;
2280-
2281- __m128i src0 = nsrc0;
2282- __m128i src1 = nsrc1;
2283- __m128i src2 = nsrc2;
2284- __m128i src3 = nsrc3;
2285- nsrc0 = src[4];
2286- nsrc1 = src[5];
2287- nsrc2 = src[6];
2288- nsrc3 = src[7];
2289-
2290- __m128i remain = _mm_cvtsi32_si128(remains[y]);
2291- repeatShiftSum3(src0, adds0, adds1, remain0);
2292- adds0 = _mm_add_epi16(adds0, remain);
2293- repeatShiftSum3(src1, adds2, adds3, remain1);
2294- repeatShiftSum3(src2, adds4, adds5, remain2);
2295- repeatShiftSum3(src3, adds6, adds7, remain);
2296- remains[y] = _mm_cvtsi128_si32(remain);
2297- adds2 = _mm_add_epi16(adds2, remain0);
2298- adds4 = _mm_add_epi16(adds4, remain1);
2299- adds6 = _mm_add_epi16(adds6, remain2);
2300-
2301- sums0 = _mm_add_epi16(sums0, adds0);
2302- sums1 = _mm_add_epi16(sums1, adds1);
2303- sums2 = _mm_add_epi16(sums2, adds2);
2304- sums3 = _mm_add_epi16(sums3, adds3);
2305- sums4 = _mm_add_epi16(sums4, adds4);
2306- sums5 = _mm_add_epi16(sums5, adds5);
2307- sums6 = _mm_add_epi16(sums6, adds6);
2308- sums7 = _mm_add_epi16(sums7, adds7);
2309-
2310- __m128i result0 = _mm_packus_epi16(_mm_mulhi_epu16(sums0, mInvRatio), _mm_mulhi_epu16(sums1, mInvRatio));
2311- __m128i result1 = _mm_packus_epi16(_mm_mulhi_epu16(sums2, mInvRatio), _mm_mulhi_epu16(sums3, mInvRatio));
2312- __m128i result2 = _mm_packus_epi16(_mm_mulhi_epu16(sums4, mInvRatio), _mm_mulhi_epu16(sums5, mInvRatio));
2313- __m128i result3 = _mm_packus_epi16(_mm_mulhi_epu16(sums6, mInvRatio), _mm_mulhi_epu16(sums7, mInvRatio));
2314- _mm_stream_si128(dst+0, result0);
2315- _mm_stream_si128(dst+1, result1);
2316- _mm_stream_si128(dst+2, result2);
2317- _mm_stream_si128(dst+3, result3);
2318- OffsetPtr(dst, destLineOffsetBytes);
2319- OffsetPtr(src, destLineOffsetBytes);
2320- }
2321- mpSrc += 4;
2322- mpDst += 4;
2323- }
2324-#else
2325- memcpy(pDest, pSrc, vCount*hCount);
2326- //__m256i* __restrict mpSrc = (__m256i* __restrict)&pSrc[0];
2327- //__m256i* __restrict mpDst = (__m256i* __restrict)&pDest[0];
2328- //for (size_t i=0; i<vCount*hCount/64; ++i) {
2329- // _mm256_stream_si256(&mpDst[i*2+0], _mm256_load_si256(&mpSrc[i*2+0]));
2330- // _mm256_stream_si256(&mpDst[i*2+1], _mm256_load_si256(&mpSrc[i*2+1]));
2331- //}
2332-#endif
2333-
2334-}
2335-
2336-void test_14(const Parameter& p) {
2337-#if _MSC_VER >= 1700
2338-
2339- BLUR_EXTRACT_PARAMS;
2340-
2341- uint32_t hRad = p.radius;
2342- uint32_t vRad = p.radius;
2343- uint32_t hLen = 1 + hRad*2;
2344- uint32_t vLen = 1 + vRad*2;
2345- uint32_t invLen = 0xFFFFFF / (hLen*vLen);
2346- uint32_t hCount = p.width;
2347- uint32_t vCount = p.height;
2348-
2349- _mm256_zeroall();
2350-
2351- static const __m256i mInvRatio = _mm256_set1_epi16(0xFFFF / 9);
2352-
2353- if (vRad != 1) {
2354- return;
2355- }
2356-
2357- const uint8_t* hLine = p.pSrc;
2358- uint8_t* vLine = p.pDest;
2359- OffsetPtr(vLine, destLineOffsetBytes * vRad);
2360-
2361- uint16_t* vSumLine = (uint16_t*)pWork2;
2362- assert((ptrdiff_t)vSumLine % 16 == 0);
2363- assert((width * 2) % 16 == 0);
2364-
2365- int* remains = (int*)p.pWork;
2366- const __m256i* mpSrc = (const __m256i*)pSrc;
2367- __m256i* mpDst = (__m256i*)pDest;
2368- for (size_t i=0; i<width/64; ++i) {
2369- const __m256i* src = mpSrc;
2370- __m256i* dst = mpDst;
2371- __m256i sums0 = _mm256_setzero_si256();
2372- __m256i sums1 = _mm256_setzero_si256();
2373- __m256i sums2 = _mm256_setzero_si256();
2374- __m256i sums3 = _mm256_setzero_si256();
2375- __m256i adds0 = _mm256_setzero_si256();
2376- __m256i adds1 = _mm256_setzero_si256();
2377- __m256i adds2 = _mm256_setzero_si256();
2378- __m256i adds3 = _mm256_setzero_si256();
2379- __m256i mids0 = _mm256_setzero_si256();
2380- __m256i mids1 = _mm256_setzero_si256();
2381- __m256i mids2 = _mm256_setzero_si256();
2382- __m256i mids3 = _mm256_setzero_si256();
2383- __m256i remain0 = _mm256_setzero_si256();
2384- __m256i remain1 = _mm256_setzero_si256();
2385- __m256i remain2 = _mm256_setzero_si256();
2386- __m256i nsrc0 = src[0];
2387- __m256i nsrc1 = src[1];
2388- for (size_t y=0; y<vCount; ++y) {
2389- sums0 = _mm256_sub_epi16(sums0, mids0);
2390- sums1 = _mm256_sub_epi16(sums1, mids1);
2391- sums2 = _mm256_sub_epi16(sums2, mids2);
2392- sums3 = _mm256_sub_epi16(sums3, mids3);
2393-
2394- mids0 = adds0;
2395- mids1 = adds1;
2396- mids2 = adds2;
2397- mids3 = adds3;
2398-
2399- __m256i src0 = nsrc0;
2400- __m256i src1 = nsrc1;
2401- nsrc0 = src[2];
2402- nsrc1 = src[3];
2403-
2404- __m256i remain = _mm256_castsi128_si256(_mm_cvtsi32_si128(remains[y]));
2405- repeatShiftSum3(src0, adds0, adds1, remain0);
2406- adds0 = _mm256_add_epi16(adds0, remain);
2407- repeatShiftSum3(src1, adds2, adds3, remain1);
2408- remains[y] = _mm_cvtsi128_si32(_mm256_castsi256_si128(remain));
2409- adds2 = _mm256_add_epi16(adds2, remain0);
2410-
2411- sums0 = _mm256_add_epi16(sums0, adds0);
2412- sums1 = _mm256_add_epi16(sums1, adds1);
2413- sums2 = _mm256_add_epi16(sums2, adds2);
2414- sums3 = _mm256_add_epi16(sums3, adds3);
2415-
2416- __m256i result0 = _mm256_packus_epi16(_mm256_mulhi_epu16(sums0, mInvRatio), _mm256_mulhi_epu16(sums1, mInvRatio));
2417- __m256i result1 = _mm256_packus_epi16(_mm256_mulhi_epu16(sums2, mInvRatio), _mm256_mulhi_epu16(sums3, mInvRatio));
2418-#if 1
2419- _mm256_stream_si256(dst+0, result0);
2420- _mm256_stream_si256(dst+1, result1);
2421-#else
2422- dst[0] = result0;
2423- dst[1] = result1;
2424-#endif
2425- OffsetPtr(dst, destLineOffsetBytes);
2426- OffsetPtr(src, destLineOffsetBytes);
2427- }
2428- mpSrc += 2;
2429- mpDst += 2;
2430- }
2431-#endif
2432-}
2433-
24341943 __forceinline
24351944 __m128i shiftAdd16(__m128i v)
24361945 {
@@ -2444,181 +1953,6 @@
24441953 return v;
24451954 }
24461955
2447-__forceinline
2448-void hProcess(const __m128i* src, __m128i* dst, size_t width, uint8_t len)
2449-{
2450- // 現在の実装には制限有り
2451- if (len > 7 || len == 0) {
2452- return;
2453- }
2454- size_t vcnt = width / 16;
2455- if (vcnt == 0) {
2456- return;
2457- }
2458- // 左側画面外の要素を反転して生成
2459- __m128i prev;
2460- __m128i cur;
2461- __m128i sum0 = _mm_setzero_si128();
2462-
2463- cur = src[0]; // 0-15
2464- const __m128i REVERSE = _mm_setr_epi8(-1,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1);
2465- prev = _mm_shuffle_epi8(cur, REVERSE); // 15-1
2466-
2467- // shift masks
2468- static const __m128i M[7*2] = {
2469- // 1
2470- {13,14,15,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,}, // prev
2471- {-1,-1,-1,+0,+1,+2,+3,+4,+5,+6,+7,+8,+9,10,11,12,}, // cur
2472- // 2
2473- {11,12,13,14,15,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,}, // prev
2474- {-1,-1,-1,-1,-1,+0,+1,+2,+3,+4,+5,+6,+7,+8,+9,10,}, // cur
2475- // 3
2476- {+9,10,11,12,13,14,15,-1,-1,-1,-1,-1,-1,-1,-1,-1,}, // prev
2477- {-1,-1,-1,-1,-1,-1,-1,+0,+1,+2,+3,+4,+5,+6,+7,+8,}, // cur
2478- // 4
2479- {+7,+8,+9,10,11,12,13,14,15,-1,-1,-1,-1,-1,-1,-1,}, // prev
2480- {-1,-1,-1,-1,-1,-1,-1,-1,-1,+0,+1,+2,+3,+4,+5,+6,}, // cur
2481- // 5
2482- {+5,+6,+7,+8,+9,10,11,12,13,14,15,-1,-1,-1,-1,-1,}, // prev
2483- {-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,+0,+1,+2,+3,+4,}, // cur
2484- // 6
2485- {+3,+4,+5,+6,+7,+8,+9,10,11,12,13,14,15,-1,-1,-1,}, // prev
2486- {-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,+0,+1,+2,}, // cur
2487- // 7
2488- {+1,+2,+3,+4,+5,+6,+7,+8,+9,10,11,12,13,14,15,-1,}, // prev
2489- {-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,+0,}, // cur
2490- };
2491- const size_t baseIdx = (len - 1) * 2;
2492- const __m128i prevMask = M[baseIdx+0];
2493- const __m128i curMask = M[baseIdx+1];
2494-
2495- const __m128i invLen = _mm_set1_epi16(0xFFFF / (1+len*2));
2496- const __m128i mask7 = {
2497- 14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,
2498- };
2499-
2500- uint16_t sum = 0;
2501- for (size_t i=0; i<1+len*2; ++i) {
2502- sum += ((const uint8_t*)src)[1+i];
2503- }
2504- sum0 = _mm_set1_epi16(sum);
2505-
2506- for (size_t i=0; i<vcnt; ++i) {
2507- __m128i next = src[i+1];
2508- __m128i prev2 = _mm_shuffle_epi8(prev, prevMask);
2509- __m128i cur2 = _mm_shuffle_epi8(cur, curMask);
2510- __m128i minus = _mm_or_si128(prev2, cur2);
2511-
2512- __m128i plus0 = _mm_unpacklo_epi8(cur, _mm_setzero_si128());
2513- __m128i plus1 = _mm_unpackhi_epi8(cur, _mm_setzero_si128());
2514- __m128i minus0 = _mm_unpacklo_epi8(minus, _mm_setzero_si128());
2515- __m128i minus1 = _mm_unpackhi_epi8(minus, _mm_setzero_si128());
2516- __m128i diff0 = _mm_sub_epi16(plus0, minus0);
2517- __m128i diff1 = _mm_sub_epi16(plus1, minus1);
2518-
2519- diff0 = shiftAdd16(diff0);
2520- sum0 = _mm_add_epi16(sum0, diff0);
2521-
2522- __m128i sum1 = _mm_shuffle_epi8(sum0, mask7);
2523- diff1 = shiftAdd16(diff1);
2524- sum1 = _mm_add_epi16(sum1, diff1);
2525-
2526- __m128i bytes = _mm_packus_epi16(_mm_mulhi_epu16(sum0, invLen), _mm_mulhi_epu16(sum1, invLen));
2527-// _mm_stream_si128(dst+i, bytes);
2528- dst[i] = bytes;
2529-// dst[i] = src[i];
2530- prev = cur;
2531- cur = next;
2532- sum0 = _mm_shuffle_epi8(sum1, mask7);
2533- }
2534-
2535-}
2536-
2537-/*
2538-#if 0
2539- __m128i sum0 = _mm_shufflehi_epi16(sum1, _MM_SHUFFLE(3,3,3,3));
2540- sum0 = _mm_unpackhi_epi64(sum0, sum0);
2541-#else
2542- __m128i sum0 = _mm_shuffle_epi8(sum1, MASK7);
2543-#endif
2544-
2545-*/
2546-
2547-__forceinline
2548-void hProcess2(const __m128i* src, __m128i* dst, size_t width, uint8_t len)
2549-{
2550- // 現在の実装には制限有り
2551- if (len > 14 || len == 0) {
2552- return;
2553- }
2554- size_t vcnt = width / 16;
2555- if (vcnt == 0) {
2556- return;
2557- }
2558-
2559- static const __m128i MASK7 = {
2560- 14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,
2561- };
2562- const __m128i REVERSE = _mm_setr_epi8(-1,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1);
2563- const __m128i invLen = _mm_set1_epi16(0xFFFF / (1+len*2));
2564-
2565- // 反転して要素を生成
2566- __m128i minusSrc2[2];
2567- minusSrc2[0] = _mm_shuffle_epi8(src[0], REVERSE); // 15-1
2568- minusSrc2[1] = src[0];
2569- uint16_t sum = 0;
2570- for (size_t i=0; i<1+len*2; ++i) {
2571- sum += ((const uint8_t*)(minusSrc2+1)) [i - (len+1)];
2572- }
2573- __m128i sum1 = _mm_set1_epi16(sum);
2574- const __m128i* plusSrc = (const __m128i*) ((const uint8_t*)src + len);
2575- const __m128i* minusSrc = (const __m128i*) ((const uint8_t*)src - (len+1));
2576- __m128i plus = _mm_loadu_si128(plusSrc);
2577- __m128i minus = _mm_loadu_si128(
2578- (const __m128i*) (
2579- (const uint8_t*)(minusSrc2+1) - (len+1)
2580- )
2581- );
2582-
2583- for (size_t i=0; i<vcnt; ++i) {
2584- __m128i nextPlus = _mm_loadu_si128(plusSrc+i+1);
2585- __m128i nextMinus = _mm_loadu_si128(minusSrc+i+1);
2586-
2587- __m128i sum0 = _mm_shuffle_epi8(sum1, MASK7);
2588- __m128i plus0 = _mm_unpacklo_epi8(plus, _mm_setzero_si128());
2589- __m128i minus0 = _mm_unpacklo_epi8(minus, _mm_setzero_si128());
2590- __m128i plus1 = _mm_unpackhi_epi8(plus, _mm_setzero_si128());
2591- __m128i minus1 = _mm_unpackhi_epi8(minus, _mm_setzero_si128());
2592- __m128i diff0 = _mm_sub_epi16(plus0, minus0);
2593- __m128i diff1 = _mm_sub_epi16(plus1, minus1);
2594-
2595- diff0 = shiftAdd16(diff0);
2596- sum0 = _mm_add_epi16(sum0, diff0);
2597- sum1 = _mm_shuffle_epi8(sum0, MASK7);
2598- diff1 = shiftAdd16(diff1);
2599- sum1 = _mm_add_epi16(sum1, diff1);
2600-
2601- __m128i bytes = _mm_packus_epi16(_mm_mulhi_epu16(sum0, invLen), _mm_mulhi_epu16(sum1, invLen));
2602- dst[i] = bytes;
2603-
2604- plus = nextPlus;
2605- minus = nextMinus;
2606-
2607- }
2608-
2609-}
2610-
2611-void test_15(const Parameter& p) {
2612-
2613- const __m128i* pSrc = (const __m128i*) p.pSrc;
2614- __m128i* pDst = (__m128i*) p.pDest;
2615- for (size_t y=0; y<p.height; ++y) {
2616- hProcess2(pSrc, pDst, p.width, p.radius);
2617- OffsetPtr(pSrc, p.srcLineOffsetBytes);
2618- OffsetPtr(pDst, p.destLineOffsetBytes);
2619- }
2620-}
2621-
26221956 static const __m128i REVERSE = _mm_setr_epi8(-1,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1);
26231957
26241958 static const __m128i MASK7 = {
@@ -2843,7 +2177,7 @@
28432177 }
28442178 };
28452179
2846-void test_16(const Parameter& p) {
2180+void test_12(const Parameter& p) {
28472181
28482182 size_t len = p.radius;
28492183 size_t width = p.width;
--- trunk/blur_1b.h (revision 28)
+++ trunk/blur_1b.h (revision 29)
@@ -47,11 +47,7 @@
4747 void test_9(const Parameter& p); // memory access further optimization
4848 void test_10(const Parameter& p); // test_9 SSE optimization
4949 void test_11(const Parameter& p); // fused horizontal & vertical
50-void test_12(const Parameter& p); // fused horizontal to vertical SSE2
51-void test_13(const Parameter& p); // fused vertical to horizontal SSE2
52-void test_14(const Parameter& p); // fused vertical to horizontal AVX2
53-void test_15(const Parameter& p); // horizontal SSSE3
54-void test_16(const Parameter& p); // horizontal vertical SSSE3
50+void test_12(const Parameter& p); // test_11 SSE3 optimization
5551
5652 // TentFilter
5753 void test_20(const Parameter& p); // C implementation
Afficher sur ancien navigateur de dépôt.