blob: 67e571c88419768ee12e7f6351682b5f35956776 [file] [log] [blame]
[email protected]02a9ac82011-02-11 21:20:131// Copyright (c) 2011 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#if defined(_MSC_VER)
6#include <intrin.h>
7#else
8#include <mmintrin.h>
9#include <emmintrin.h>
10#endif
11
12#include "remoting/host/differ_block.h"
13#include "remoting/host/differ_block_internal.h"
14
15namespace remoting {
16
17extern int BlockDifference_SSE2_W16(const uint8* image1, const uint8* image2,
18 int stride) {
19 __m128i acc = _mm_setzero_si128();
20 __m128i v0;
21 __m128i v1;
22 __m128i sad;
[email protected]d8fe8f12011-08-04 20:09:1223 for (int y = 0; y < kBlockSize; ++y) {
[email protected]02a9ac82011-02-11 21:20:1324 const __m128i* i1 = reinterpret_cast<const __m128i*>(image1);
25 const __m128i* i2 = reinterpret_cast<const __m128i*>(image2);
26 v0 = _mm_loadu_si128(i1);
27 v1 = _mm_loadu_si128(i2);
28 sad = _mm_sad_epu8(v0, v1);
29 acc = _mm_adds_epu16(acc, sad);
30 v0 = _mm_loadu_si128(i1 + 1);
31 v1 = _mm_loadu_si128(i2 + 1);
32 sad = _mm_sad_epu8(v0, v1);
33 acc = _mm_adds_epu16(acc, sad);
34 v0 = _mm_loadu_si128(i1 + 2);
35 v1 = _mm_loadu_si128(i2 + 2);
36 sad = _mm_sad_epu8(v0, v1);
37 acc = _mm_adds_epu16(acc, sad);
38 v0 = _mm_loadu_si128(i1 + 3);
39 v1 = _mm_loadu_si128(i2 + 3);
40 sad = _mm_sad_epu8(v0, v1);
41 acc = _mm_adds_epu16(acc, sad);
42
43 // This essential means sad = acc >> 64. We only care about the lower 16
44 // bits.
45 sad = _mm_shuffle_epi32(acc, 0xEE);
46 sad = _mm_adds_epu16(sad, acc);
47 int diff = _mm_cvtsi128_si32(sad);
48 if (diff)
49 return 1;
50 image1 += stride;
51 image2 += stride;
52 }
53 return 0;
54}
55
56extern int BlockDifference_SSE2_W32(const uint8* image1, const uint8* image2,
57 int stride) {
58 __m128i acc = _mm_setzero_si128();
59 __m128i v0;
60 __m128i v1;
61 __m128i sad;
[email protected]d8fe8f12011-08-04 20:09:1262 for (int y = 0; y < kBlockSize; ++y) {
[email protected]02a9ac82011-02-11 21:20:1363 const __m128i* i1 = reinterpret_cast<const __m128i*>(image1);
64 const __m128i* i2 = reinterpret_cast<const __m128i*>(image2);
65 v0 = _mm_loadu_si128(i1);
66 v1 = _mm_loadu_si128(i2);
67 sad = _mm_sad_epu8(v0, v1);
68 acc = _mm_adds_epu16(acc, sad);
69 v0 = _mm_loadu_si128(i1 + 1);
70 v1 = _mm_loadu_si128(i2 + 1);
71 sad = _mm_sad_epu8(v0, v1);
72 acc = _mm_adds_epu16(acc, sad);
73 v0 = _mm_loadu_si128(i1 + 2);
74 v1 = _mm_loadu_si128(i2 + 2);
75 sad = _mm_sad_epu8(v0, v1);
76 acc = _mm_adds_epu16(acc, sad);
77 v0 = _mm_loadu_si128(i1 + 3);
78 v1 = _mm_loadu_si128(i2 + 3);
79 sad = _mm_sad_epu8(v0, v1);
80 acc = _mm_adds_epu16(acc, sad);
81 v0 = _mm_loadu_si128(i1 + 4);
82 v1 = _mm_loadu_si128(i2 + 4);
83 sad = _mm_sad_epu8(v0, v1);
84 acc = _mm_adds_epu16(acc, sad);
85 v0 = _mm_loadu_si128(i1 + 5);
86 v1 = _mm_loadu_si128(i2 + 5);
87 sad = _mm_sad_epu8(v0, v1);
88 acc = _mm_adds_epu16(acc, sad);
89 v0 = _mm_loadu_si128(i1 + 6);
90 v1 = _mm_loadu_si128(i2 + 6);
91 sad = _mm_sad_epu8(v0, v1);
92 acc = _mm_adds_epu16(acc, sad);
93 v0 = _mm_loadu_si128(i1 + 7);
94 v1 = _mm_loadu_si128(i2 + 7);
95 sad = _mm_sad_epu8(v0, v1);
96 acc = _mm_adds_epu16(acc, sad);
97
98 // This essential means sad = acc >> 64. We only care about the lower 16
99 // bits.
100 sad = _mm_shuffle_epi32(acc, 0xEE);
101 sad = _mm_adds_epu16(sad, acc);
102 int diff = _mm_cvtsi128_si32(sad);
103 if (diff)
104 return 1;
105 image1 += stride;
106 image2 += stride;
107 }
108 return 0;
109}
110
111} // namespace remoting