Blame - skia/ext/convolver_SSE2.cc - chromium/src.git

blob: a823edcb519291d89ed79c448efa98f9569bb788 [file] [log] [blame]

[email protected]	c1ca658	2013-04-09 00:32:02	[diff] [blame^]	1	// Copyright (c) 2011 The Chromium Authors. All rights reserved.
				2	// Use of this source code is governed by a BSD-style license that can be
				3	// found in the LICENSE file.
				4
				5	#include <algorithm>
				6
				7	#include "skia/ext/convolver.h"
				8	#include "skia/ext/convolver_SSE2.h"
				9	#include "third_party/skia/include/core/SkTypes.h"
				10
				11	#include <emmintrin.h> // ARCH_CPU_X86_FAMILY was defined in build/config.h
				12
				13	namespace skia {
				14
				15	// Convolves horizontally along a single row. The row data is given in
				16	// \|src_data\| and continues for the num_values() of the filter.
				17	void ConvolveHorizontally_SSE2(const unsigned char* src_data,
				18	const ConvolutionFilter1D& filter,
				19	unsigned char* out_row) {
				20	int num_values = filter.num_values();
				21
				22	int filter_offset, filter_length;
				23	__m128i zero = _mm_setzero_si128();
				24	__m128i mask[4];
				25	// \|mask\| will be used to decimate all extra filter coefficients that are
				26	// loaded by SIMD when \|filter_length\| is not divisible by 4.
				27	// mask[0] is not used in following algorithm.
				28	mask[1] = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, -1);
				29	mask[2] = _mm_set_epi16(0, 0, 0, 0, 0, 0, -1, -1);
				30	mask[3] = _mm_set_epi16(0, 0, 0, 0, 0, -1, -1, -1);
				31
				32	// Output one pixel each iteration, calculating all channels (RGBA) together.
				33	for (int out_x = 0; out_x < num_values; out_x++) {
				34	const ConvolutionFilter1D::Fixed* filter_values =
				35	filter.FilterForValue(out_x, &filter_offset, &filter_length);
				36
				37	__m128i accum = _mm_setzero_si128();
				38
				39	// Compute the first pixel in this row that the filter affects. It will
				40	// touch \|filter_length\| pixels (4 bytes each) after this.
				41	const __m128i* row_to_filter =
				42	reinterpret_cast<const __m128i*>(&src_data[filter_offset << 2]);
				43
				44	// We will load and accumulate with four coefficients per iteration.
				45	for (int filter_x = 0; filter_x < filter_length >> 2; filter_x++) {
				46
				47	// Load 4 coefficients => duplicate 1st and 2nd of them for all channels.
				48	__m128i coeff, coeff16;
				49	// [16] xx xx xx xx c3 c2 c1 c0
				50	coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));
				51	// [16] xx xx xx xx c1 c1 c0 c0
				52	coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));
				53	// [16] c1 c1 c1 c1 c0 c0 c0 c0
				54	coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);
				55
				56	// Load four pixels => unpack the first two pixels to 16 bits =>
				57	// multiply with coefficients => accumulate the convolution result.
				58	// [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
				59	__m128i src8 = _mm_loadu_si128(row_to_filter);
				60	// [16] a1 b1 g1 r1 a0 b0 g0 r0
				61	__m128i src16 = _mm_unpacklo_epi8(src8, zero);
				62	__m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);
				63	__m128i mul_lo = _mm_mullo_epi16(src16, coeff16);
				64	// [32] a0c0 b0c0 g0c0 r0c0
				65	__m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);
				66	accum = _mm_add_epi32(accum, t);
				67	// [32] a1c1 b1c1 g1c1 r1c1
				68	t = _mm_unpackhi_epi16(mul_lo, mul_hi);
				69	accum = _mm_add_epi32(accum, t);
				70
				71	// Duplicate 3rd and 4th coefficients for all channels =>
				72	// unpack the 3rd and 4th pixels to 16 bits => multiply with coefficients
				73	// => accumulate the convolution results.
				74	// [16] xx xx xx xx c3 c3 c2 c2
				75	coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));
				76	// [16] c3 c3 c3 c3 c2 c2 c2 c2
				77	coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);
				78	// [16] a3 g3 b3 r3 a2 g2 b2 r2
				79	src16 = _mm_unpackhi_epi8(src8, zero);
				80	mul_hi = _mm_mulhi_epi16(src16, coeff16);
				81	mul_lo = _mm_mullo_epi16(src16, coeff16);
				82	// [32] a2c2 b2c2 g2c2 r2c2
				83	t = _mm_unpacklo_epi16(mul_lo, mul_hi);
				84	accum = _mm_add_epi32(accum, t);
				85	// [32] a3c3 b3c3 g3c3 r3c3
				86	t = _mm_unpackhi_epi16(mul_lo, mul_hi);
				87	accum = _mm_add_epi32(accum, t);
				88
				89	// Advance the pixel and coefficients pointers.
				90	row_to_filter += 1;
				91	filter_values += 4;
				92	}
				93
				94	// When \|filter_length\| is not divisible by 4, we need to decimate some of
				95	// the filter coefficient that was loaded incorrectly to zero; Other than
				96	// that the algorithm is same with above, exceot that the 4th pixel will be
				97	// always absent.
				98	int r = filter_length&3;
				99	if (r) {
				100	// Note: filter_values must be padded to align_up(filter_offset, 8).
				101	__m128i coeff, coeff16;
				102	coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));
				103	// Mask out extra filter taps.
				104	coeff = _mm_and_si128(coeff, mask[r]);
				105	coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));
				106	coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);
				107
				108	// Note: line buffer must be padded to align_up(filter_offset, 16).
				109	// We resolve this by use C-version for the last horizontal line.
				110	__m128i src8 = _mm_loadu_si128(row_to_filter);
				111	__m128i src16 = _mm_unpacklo_epi8(src8, zero);
				112	__m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);
				113	__m128i mul_lo = _mm_mullo_epi16(src16, coeff16);
				114	__m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);
				115	accum = _mm_add_epi32(accum, t);
				116	t = _mm_unpackhi_epi16(mul_lo, mul_hi);
				117	accum = _mm_add_epi32(accum, t);
				118
				119	src16 = _mm_unpackhi_epi8(src8, zero);
				120	coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));
				121	coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);
				122	mul_hi = _mm_mulhi_epi16(src16, coeff16);
				123	mul_lo = _mm_mullo_epi16(src16, coeff16);
				124	t = _mm_unpacklo_epi16(mul_lo, mul_hi);
				125	accum = _mm_add_epi32(accum, t);
				126	}
				127
				128	// Shift right for fixed point implementation.
				129	accum = _mm_srai_epi32(accum, ConvolutionFilter1D::kShiftBits);
				130
				131	// Packing 32 bits \|accum\| to 16 bits per channel (signed saturation).
				132	accum = _mm_packs_epi32(accum, zero);
				133	// Packing 16 bits \|accum\| to 8 bits per channel (unsigned saturation).
				134	accum = _mm_packus_epi16(accum, zero);
				135
				136	// Store the pixel value of 32 bits.
				137	(reinterpret_cast<int>(out_row)) = _mm_cvtsi128_si32(accum);
				138	out_row += 4;
				139	}
				140	}
				141
				142	// Convolves horizontally along four rows. The row data is given in
				143	// \|src_data\| and continues for the num_values() of the filter.
				144	// The algorithm is almost same as \|ConvolveHorizontally_SSE2\|. Please
				145	// refer to that function for detailed comments.
				146	void Convolve4RowsHorizontally_SSE2(const unsigned char* src_data[4],
				147	const ConvolutionFilter1D& filter,
				148	unsigned char* out_row[4]) {
				149	int num_values = filter.num_values();
				150
				151	int filter_offset, filter_length;
				152	__m128i zero = _mm_setzero_si128();
				153	__m128i mask[4];
				154	// \|mask\| will be used to decimate all extra filter coefficients that are
				155	// loaded by SIMD when \|filter_length\| is not divisible by 4.
				156	// mask[0] is not used in following algorithm.
				157	mask[1] = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, -1);
				158	mask[2] = _mm_set_epi16(0, 0, 0, 0, 0, 0, -1, -1);
				159	mask[3] = _mm_set_epi16(0, 0, 0, 0, 0, -1, -1, -1);
				160
				161	// Output one pixel each iteration, calculating all channels (RGBA) together.
				162	for (int out_x = 0; out_x < num_values; out_x++) {
				163	const ConvolutionFilter1D::Fixed* filter_values =
				164	filter.FilterForValue(out_x, &filter_offset, &filter_length);
				165
				166	// four pixels in a column per iteration.
				167	__m128i accum0 = _mm_setzero_si128();
				168	__m128i accum1 = _mm_setzero_si128();
				169	__m128i accum2 = _mm_setzero_si128();
				170	__m128i accum3 = _mm_setzero_si128();
				171	int start = (filter_offset<<2);
				172	// We will load and accumulate with four coefficients per iteration.
				173	for (int filter_x = 0; filter_x < (filter_length >> 2); filter_x++) {
				174	__m128i coeff, coeff16lo, coeff16hi;
				175	// [16] xx xx xx xx c3 c2 c1 c0
				176	coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));
				177	// [16] xx xx xx xx c1 c1 c0 c0
				178	coeff16lo = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));
				179	// [16] c1 c1 c1 c1 c0 c0 c0 c0
				180	coeff16lo = _mm_unpacklo_epi16(coeff16lo, coeff16lo);
				181	// [16] xx xx xx xx c3 c3 c2 c2
				182	coeff16hi = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));
				183	// [16] c3 c3 c3 c3 c2 c2 c2 c2
				184	coeff16hi = _mm_unpacklo_epi16(coeff16hi, coeff16hi);
				185
				186	__m128i src8, src16, mul_hi, mul_lo, t;
				187
				188	#define ITERATION(src, accum) \
				189	src8 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src)); \
				190	src16 = _mm_unpacklo_epi8(src8, zero); \
				191	mul_hi = _mm_mulhi_epi16(src16, coeff16lo); \
				192	mul_lo = _mm_mullo_epi16(src16, coeff16lo); \
				193	t = _mm_unpacklo_epi16(mul_lo, mul_hi); \
				194	accum = _mm_add_epi32(accum, t); \
				195	t = _mm_unpackhi_epi16(mul_lo, mul_hi); \
				196	accum = _mm_add_epi32(accum, t); \
				197	src16 = _mm_unpackhi_epi8(src8, zero); \
				198	mul_hi = _mm_mulhi_epi16(src16, coeff16hi); \
				199	mul_lo = _mm_mullo_epi16(src16, coeff16hi); \
				200	t = _mm_unpacklo_epi16(mul_lo, mul_hi); \
				201	accum = _mm_add_epi32(accum, t); \
				202	t = _mm_unpackhi_epi16(mul_lo, mul_hi); \
				203	accum = _mm_add_epi32(accum, t)
				204
				205	ITERATION(src_data[0] + start, accum0);
				206	ITERATION(src_data[1] + start, accum1);
				207	ITERATION(src_data[2] + start, accum2);
				208	ITERATION(src_data[3] + start, accum3);
				209
				210	start += 16;
				211	filter_values += 4;
				212	}
				213
				214	int r = filter_length & 3;
				215	if (r) {
				216	// Note: filter_values must be padded to align_up(filter_offset, 8);
				217	__m128i coeff;
				218	coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));
				219	// Mask out extra filter taps.
				220	coeff = _mm_and_si128(coeff, mask[r]);
				221
				222	__m128i coeff16lo = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));
				223	/* c1 c1 c1 c1 c0 c0 c0 c0 */
				224	coeff16lo = _mm_unpacklo_epi16(coeff16lo, coeff16lo);
				225	__m128i coeff16hi = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));
				226	coeff16hi = _mm_unpacklo_epi16(coeff16hi, coeff16hi);
				227
				228	__m128i src8, src16, mul_hi, mul_lo, t;
				229
				230	ITERATION(src_data[0] + start, accum0);
				231	ITERATION(src_data[1] + start, accum1);
				232	ITERATION(src_data[2] + start, accum2);
				233	ITERATION(src_data[3] + start, accum3);
				234	}
				235
				236	accum0 = _mm_srai_epi32(accum0, ConvolutionFilter1D::kShiftBits);
				237	accum0 = _mm_packs_epi32(accum0, zero);
				238	accum0 = _mm_packus_epi16(accum0, zero);
				239	accum1 = _mm_srai_epi32(accum1, ConvolutionFilter1D::kShiftBits);
				240	accum1 = _mm_packs_epi32(accum1, zero);
				241	accum1 = _mm_packus_epi16(accum1, zero);
				242	accum2 = _mm_srai_epi32(accum2, ConvolutionFilter1D::kShiftBits);
				243	accum2 = _mm_packs_epi32(accum2, zero);
				244	accum2 = _mm_packus_epi16(accum2, zero);
				245	accum3 = _mm_srai_epi32(accum3, ConvolutionFilter1D::kShiftBits);
				246	accum3 = _mm_packs_epi32(accum3, zero);
				247	accum3 = _mm_packus_epi16(accum3, zero);
				248
				249	(reinterpret_cast<int>(out_row[0])) = _mm_cvtsi128_si32(accum0);
				250	(reinterpret_cast<int>(out_row[1])) = _mm_cvtsi128_si32(accum1);
				251	(reinterpret_cast<int>(out_row[2])) = _mm_cvtsi128_si32(accum2);
				252	(reinterpret_cast<int>(out_row[3])) = _mm_cvtsi128_si32(accum3);
				253
				254	out_row[0] += 4;
				255	out_row[1] += 4;
				256	out_row[2] += 4;
				257	out_row[3] += 4;
				258	}
				259	}
				260
				261	// Does vertical convolution to produce one output row. The filter values and
				262	// length are given in the first two parameters. These are applied to each
				263	// of the rows pointed to in the \|source_data_rows\| array, with each row
				264	// being \|pixel_width\| wide.
				265	//
				266	// The output must have room for \|pixel_width * 4\| bytes.
				267	template<bool has_alpha>
				268	void ConvolveVertically_SSE2(const ConvolutionFilter1D::Fixed* filter_values,
				269	int filter_length,
				270	unsigned char* const* source_data_rows,
				271	int pixel_width,
				272	unsigned char* out_row) {
				273	int width = pixel_width & ~3;
				274
				275	__m128i zero = _mm_setzero_si128();
				276	__m128i accum0, accum1, accum2, accum3, coeff16;
				277	const __m128i* src;
				278	// Output four pixels per iteration (16 bytes).
				279	for (int out_x = 0; out_x < width; out_x += 4) {
				280
				281	// Accumulated result for each pixel. 32 bits per RGBA channel.
				282	accum0 = _mm_setzero_si128();
				283	accum1 = _mm_setzero_si128();
				284	accum2 = _mm_setzero_si128();
				285	accum3 = _mm_setzero_si128();
				286
				287	// Convolve with one filter coefficient per iteration.
				288	for (int filter_y = 0; filter_y < filter_length; filter_y++) {
				289
				290	// Duplicate the filter coefficient 8 times.
				291	// [16] cj cj cj cj cj cj cj cj
				292	coeff16 = _mm_set1_epi16(filter_values[filter_y]);
				293
				294	// Load four pixels (16 bytes) together.
				295	// [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
				296	src = reinterpret_cast<const __m128i*>(
				297	&source_data_rows[filter_y][out_x << 2]);
				298	__m128i src8 = _mm_loadu_si128(src);
				299
				300	// Unpack 1st and 2nd pixels from 8 bits to 16 bits for each channels =>
				301	// multiply with current coefficient => accumulate the result.
				302	// [16] a1 b1 g1 r1 a0 b0 g0 r0
				303	__m128i src16 = _mm_unpacklo_epi8(src8, zero);
				304	__m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);
				305	__m128i mul_lo = _mm_mullo_epi16(src16, coeff16);
				306	// [32] a0 b0 g0 r0
				307	__m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);
				308	accum0 = _mm_add_epi32(accum0, t);
				309	// [32] a1 b1 g1 r1
				310	t = _mm_unpackhi_epi16(mul_lo, mul_hi);
				311	accum1 = _mm_add_epi32(accum1, t);
				312
				313	// Unpack 3rd and 4th pixels from 8 bits to 16 bits for each channels =>
				314	// multiply with current coefficient => accumulate the result.
				315	// [16] a3 b3 g3 r3 a2 b2 g2 r2
				316	src16 = _mm_unpackhi_epi8(src8, zero);
				317	mul_hi = _mm_mulhi_epi16(src16, coeff16);
				318	mul_lo = _mm_mullo_epi16(src16, coeff16);
				319	// [32] a2 b2 g2 r2
				320	t = _mm_unpacklo_epi16(mul_lo, mul_hi);
				321	accum2 = _mm_add_epi32(accum2, t);
				322	// [32] a3 b3 g3 r3
				323	t = _mm_unpackhi_epi16(mul_lo, mul_hi);
				324	accum3 = _mm_add_epi32(accum3, t);
				325	}
				326
				327	// Shift right for fixed point implementation.
				328	accum0 = _mm_srai_epi32(accum0, ConvolutionFilter1D::kShiftBits);
				329	accum1 = _mm_srai_epi32(accum1, ConvolutionFilter1D::kShiftBits);
				330	accum2 = _mm_srai_epi32(accum2, ConvolutionFilter1D::kShiftBits);
				331	accum3 = _mm_srai_epi32(accum3, ConvolutionFilter1D::kShiftBits);
				332
				333	// Packing 32 bits \|accum\| to 16 bits per channel (signed saturation).
				334	// [16] a1 b1 g1 r1 a0 b0 g0 r0
				335	accum0 = _mm_packs_epi32(accum0, accum1);
				336	// [16] a3 b3 g3 r3 a2 b2 g2 r2
				337	accum2 = _mm_packs_epi32(accum2, accum3);
				338
				339	// Packing 16 bits \|accum\| to 8 bits per channel (unsigned saturation).
				340	// [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
				341	accum0 = _mm_packus_epi16(accum0, accum2);
				342
				343	if (has_alpha) {
				344	// Compute the max(ri, gi, bi) for each pixel.
				345	// [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0
				346	__m128i a = _mm_srli_epi32(accum0, 8);
				347	// [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
				348	__m128i b = _mm_max_epu8(a, accum0); // Max of r and g.
				349	// [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0
				350	a = _mm_srli_epi32(accum0, 16);
				351	// [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
				352	b = _mm_max_epu8(a, b); // Max of r and g and b.
				353	// [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00
				354	b = _mm_slli_epi32(b, 24);
				355
				356	// Make sure the value of alpha channel is always larger than maximum
				357	// value of color channels.
				358	accum0 = _mm_max_epu8(b, accum0);
				359	} else {
				360	// Set value of alpha channels to 0xFF.
				361	__m128i mask = _mm_set1_epi32(0xff000000);
				362	accum0 = _mm_or_si128(accum0, mask);
				363	}
				364
				365	// Store the convolution result (16 bytes) and advance the pixel pointers.
				366	_mm_storeu_si128(reinterpret_cast<__m128i*>(out_row), accum0);
				367	out_row += 16;
				368	}
				369
				370	// When the width of the output is not divisible by 4, We need to save one
				371	// pixel (4 bytes) each time. And also the fourth pixel is always absent.
				372	if (pixel_width & 3) {
				373	accum0 = _mm_setzero_si128();
				374	accum1 = _mm_setzero_si128();
				375	accum2 = _mm_setzero_si128();
				376	for (int filter_y = 0; filter_y < filter_length; ++filter_y) {
				377	coeff16 = _mm_set1_epi16(filter_values[filter_y]);
				378	// [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
				379	src = reinterpret_cast<const __m128i*>(
				380	&source_data_rows[filter_y][width<<2]);
				381	__m128i src8 = _mm_loadu_si128(src);
				382	// [16] a1 b1 g1 r1 a0 b0 g0 r0
				383	__m128i src16 = _mm_unpacklo_epi8(src8, zero);
				384	__m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);
				385	__m128i mul_lo = _mm_mullo_epi16(src16, coeff16);
				386	// [32] a0 b0 g0 r0
				387	__m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);
				388	accum0 = _mm_add_epi32(accum0, t);
				389	// [32] a1 b1 g1 r1
				390	t = _mm_unpackhi_epi16(mul_lo, mul_hi);
				391	accum1 = _mm_add_epi32(accum1, t);
				392	// [16] a3 b3 g3 r3 a2 b2 g2 r2
				393	src16 = _mm_unpackhi_epi8(src8, zero);
				394	mul_hi = _mm_mulhi_epi16(src16, coeff16);
				395	mul_lo = _mm_mullo_epi16(src16, coeff16);
				396	// [32] a2 b2 g2 r2
				397	t = _mm_unpacklo_epi16(mul_lo, mul_hi);
				398	accum2 = _mm_add_epi32(accum2, t);
				399	}
				400
				401	accum0 = _mm_srai_epi32(accum0, ConvolutionFilter1D::kShiftBits);
				402	accum1 = _mm_srai_epi32(accum1, ConvolutionFilter1D::kShiftBits);
				403	accum2 = _mm_srai_epi32(accum2, ConvolutionFilter1D::kShiftBits);
				404	// [16] a1 b1 g1 r1 a0 b0 g0 r0
				405	accum0 = _mm_packs_epi32(accum0, accum1);
				406	// [16] a3 b3 g3 r3 a2 b2 g2 r2
				407	accum2 = _mm_packs_epi32(accum2, zero);
				408	// [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
				409	accum0 = _mm_packus_epi16(accum0, accum2);
				410	if (has_alpha) {
				411	// [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0
				412	__m128i a = _mm_srli_epi32(accum0, 8);
				413	// [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
				414	__m128i b = _mm_max_epu8(a, accum0); // Max of r and g.
				415	// [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0
				416	a = _mm_srli_epi32(accum0, 16);
				417	// [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
				418	b = _mm_max_epu8(a, b); // Max of r and g and b.
				419	// [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00
				420	b = _mm_slli_epi32(b, 24);
				421	accum0 = _mm_max_epu8(b, accum0);
				422	} else {
				423	__m128i mask = _mm_set1_epi32(0xff000000);
				424	accum0 = _mm_or_si128(accum0, mask);
				425	}
				426
				427	for (int out_x = width; out_x < pixel_width; out_x++) {
				428	(reinterpret_cast<int>(out_row)) = _mm_cvtsi128_si32(accum0);
				429	accum0 = _mm_srli_si128(accum0, 4);
				430	out_row += 4;
				431	}
				432	}
				433	}
				434
				435	void ConvolveVertically_SSE2(const ConvolutionFilter1D::Fixed* filter_values,
				436	int filter_length,
				437	unsigned char* const* source_data_rows,
				438	int pixel_width,
				439	unsigned char* out_row,
				440	bool has_alpha) {
				441	if (has_alpha) {
				442	ConvolveVertically_SSE2<true>(filter_values,
				443	filter_length,
				444	source_data_rows,
				445	pixel_width,
				446	out_row);
				447	} else {
				448	ConvolveVertically_SSE2<false>(filter_values,
				449	filter_length,
				450	source_data_rows,
				451	pixel_width,
				452	out_row);
				453	}
				454	}
				455
				456	} // namespace skia