Blame - skia/ext/convolver_SSE2.cc - chromium/src.git

blob: a77a1f45c41942834ca0d3668557bd41bd9c885b [file] [log] [blame]

[email protected]	c1ca658	2013-04-09 00:32:02	[diff] [blame]	1	// Copyright (c) 2011 The Chromium Authors. All rights reserved.
				2	// Use of this source code is governed by a BSD-style license that can be
				3	// found in the LICENSE file.
				4
				5	#include <algorithm>
				6
				7	#include "skia/ext/convolver.h"
				8	#include "skia/ext/convolver_SSE2.h"
				9	#include "third_party/skia/include/core/SkTypes.h"
				10
				11	#include <emmintrin.h> // ARCH_CPU_X86_FAMILY was defined in build/config.h
				12
				13	namespace skia {
				14
				15	// Convolves horizontally along a single row. The row data is given in
				16	// \|src_data\| and continues for the num_values() of the filter.
				17	void ConvolveHorizontally_SSE2(const unsigned char* src_data,
				18	const ConvolutionFilter1D& filter,
[email protected]	c0e4e8d29	2013-05-24 22:20:49	[diff] [blame]	19	unsigned char* out_row,
				20	bool /has_alpha/) {
[email protected]	c1ca658	2013-04-09 00:32:02	[diff] [blame]	21	int num_values = filter.num_values();
				22
				23	int filter_offset, filter_length;
				24	__m128i zero = _mm_setzero_si128();
				25	__m128i mask[4];
				26	// \|mask\| will be used to decimate all extra filter coefficients that are
				27	// loaded by SIMD when \|filter_length\| is not divisible by 4.
				28	// mask[0] is not used in following algorithm.
				29	mask[1] = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, -1);
				30	mask[2] = _mm_set_epi16(0, 0, 0, 0, 0, 0, -1, -1);
				31	mask[3] = _mm_set_epi16(0, 0, 0, 0, 0, -1, -1, -1);
				32
				33	// Output one pixel each iteration, calculating all channels (RGBA) together.
				34	for (int out_x = 0; out_x < num_values; out_x++) {
				35	const ConvolutionFilter1D::Fixed* filter_values =
				36	filter.FilterForValue(out_x, &filter_offset, &filter_length);
				37
				38	__m128i accum = _mm_setzero_si128();
				39
				40	// Compute the first pixel in this row that the filter affects. It will
				41	// touch \|filter_length\| pixels (4 bytes each) after this.
				42	const __m128i* row_to_filter =
				43	reinterpret_cast<const __m128i*>(&src_data[filter_offset << 2]);
				44
				45	// We will load and accumulate with four coefficients per iteration.
				46	for (int filter_x = 0; filter_x < filter_length >> 2; filter_x++) {
				47
				48	// Load 4 coefficients => duplicate 1st and 2nd of them for all channels.
				49	__m128i coeff, coeff16;
				50	// [16] xx xx xx xx c3 c2 c1 c0
				51	coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));
				52	// [16] xx xx xx xx c1 c1 c0 c0
				53	coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));
				54	// [16] c1 c1 c1 c1 c0 c0 c0 c0
				55	coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);
				56
				57	// Load four pixels => unpack the first two pixels to 16 bits =>
				58	// multiply with coefficients => accumulate the convolution result.
				59	// [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
				60	__m128i src8 = _mm_loadu_si128(row_to_filter);
				61	// [16] a1 b1 g1 r1 a0 b0 g0 r0
				62	__m128i src16 = _mm_unpacklo_epi8(src8, zero);
				63	__m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);
				64	__m128i mul_lo = _mm_mullo_epi16(src16, coeff16);
				65	// [32] a0c0 b0c0 g0c0 r0c0
				66	__m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);
				67	accum = _mm_add_epi32(accum, t);
				68	// [32] a1c1 b1c1 g1c1 r1c1
				69	t = _mm_unpackhi_epi16(mul_lo, mul_hi);
				70	accum = _mm_add_epi32(accum, t);
				71
				72	// Duplicate 3rd and 4th coefficients for all channels =>
				73	// unpack the 3rd and 4th pixels to 16 bits => multiply with coefficients
				74	// => accumulate the convolution results.
				75	// [16] xx xx xx xx c3 c3 c2 c2
				76	coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));
				77	// [16] c3 c3 c3 c3 c2 c2 c2 c2
				78	coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);
				79	// [16] a3 g3 b3 r3 a2 g2 b2 r2
				80	src16 = _mm_unpackhi_epi8(src8, zero);
				81	mul_hi = _mm_mulhi_epi16(src16, coeff16);
				82	mul_lo = _mm_mullo_epi16(src16, coeff16);
				83	// [32] a2c2 b2c2 g2c2 r2c2
				84	t = _mm_unpacklo_epi16(mul_lo, mul_hi);
				85	accum = _mm_add_epi32(accum, t);
				86	// [32] a3c3 b3c3 g3c3 r3c3
				87	t = _mm_unpackhi_epi16(mul_lo, mul_hi);
				88	accum = _mm_add_epi32(accum, t);
				89
				90	// Advance the pixel and coefficients pointers.
				91	row_to_filter += 1;
				92	filter_values += 4;
				93	}
				94
				95	// When \|filter_length\| is not divisible by 4, we need to decimate some of
				96	// the filter coefficient that was loaded incorrectly to zero; Other than
				97	// that the algorithm is same with above, exceot that the 4th pixel will be
				98	// always absent.
				99	int r = filter_length&3;
				100	if (r) {
				101	// Note: filter_values must be padded to align_up(filter_offset, 8).
				102	__m128i coeff, coeff16;
				103	coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));
				104	// Mask out extra filter taps.
				105	coeff = _mm_and_si128(coeff, mask[r]);
				106	coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));
				107	coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);
				108
				109	// Note: line buffer must be padded to align_up(filter_offset, 16).
				110	// We resolve this by use C-version for the last horizontal line.
				111	__m128i src8 = _mm_loadu_si128(row_to_filter);
				112	__m128i src16 = _mm_unpacklo_epi8(src8, zero);
				113	__m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);
				114	__m128i mul_lo = _mm_mullo_epi16(src16, coeff16);
				115	__m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);
				116	accum = _mm_add_epi32(accum, t);
				117	t = _mm_unpackhi_epi16(mul_lo, mul_hi);
				118	accum = _mm_add_epi32(accum, t);
				119
				120	src16 = _mm_unpackhi_epi8(src8, zero);
				121	coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));
				122	coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);
				123	mul_hi = _mm_mulhi_epi16(src16, coeff16);
				124	mul_lo = _mm_mullo_epi16(src16, coeff16);
				125	t = _mm_unpacklo_epi16(mul_lo, mul_hi);
				126	accum = _mm_add_epi32(accum, t);
				127	}
				128
				129	// Shift right for fixed point implementation.
				130	accum = _mm_srai_epi32(accum, ConvolutionFilter1D::kShiftBits);
				131
				132	// Packing 32 bits \|accum\| to 16 bits per channel (signed saturation).
				133	accum = _mm_packs_epi32(accum, zero);
				134	// Packing 16 bits \|accum\| to 8 bits per channel (unsigned saturation).
				135	accum = _mm_packus_epi16(accum, zero);
				136
				137	// Store the pixel value of 32 bits.
				138	(reinterpret_cast<int>(out_row)) = _mm_cvtsi128_si32(accum);
				139	out_row += 4;
				140	}
				141	}
				142
				143	// Convolves horizontally along four rows. The row data is given in
				144	// \|src_data\| and continues for the num_values() of the filter.
				145	// The algorithm is almost same as \|ConvolveHorizontally_SSE2\|. Please
				146	// refer to that function for detailed comments.
				147	void Convolve4RowsHorizontally_SSE2(const unsigned char* src_data[4],
				148	const ConvolutionFilter1D& filter,
				149	unsigned char* out_row[4]) {
				150	int num_values = filter.num_values();
				151
				152	int filter_offset, filter_length;
				153	__m128i zero = _mm_setzero_si128();
				154	__m128i mask[4];
				155	// \|mask\| will be used to decimate all extra filter coefficients that are
				156	// loaded by SIMD when \|filter_length\| is not divisible by 4.
				157	// mask[0] is not used in following algorithm.
				158	mask[1] = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, -1);
				159	mask[2] = _mm_set_epi16(0, 0, 0, 0, 0, 0, -1, -1);
				160	mask[3] = _mm_set_epi16(0, 0, 0, 0, 0, -1, -1, -1);
				161
				162	// Output one pixel each iteration, calculating all channels (RGBA) together.
				163	for (int out_x = 0; out_x < num_values; out_x++) {
				164	const ConvolutionFilter1D::Fixed* filter_values =
				165	filter.FilterForValue(out_x, &filter_offset, &filter_length);
				166
				167	// four pixels in a column per iteration.
				168	__m128i accum0 = _mm_setzero_si128();
				169	__m128i accum1 = _mm_setzero_si128();
				170	__m128i accum2 = _mm_setzero_si128();
				171	__m128i accum3 = _mm_setzero_si128();
				172	int start = (filter_offset<<2);
				173	// We will load and accumulate with four coefficients per iteration.
				174	for (int filter_x = 0; filter_x < (filter_length >> 2); filter_x++) {
				175	__m128i coeff, coeff16lo, coeff16hi;
				176	// [16] xx xx xx xx c3 c2 c1 c0
				177	coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));
				178	// [16] xx xx xx xx c1 c1 c0 c0
				179	coeff16lo = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));
				180	// [16] c1 c1 c1 c1 c0 c0 c0 c0
				181	coeff16lo = _mm_unpacklo_epi16(coeff16lo, coeff16lo);
				182	// [16] xx xx xx xx c3 c3 c2 c2
				183	coeff16hi = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));
				184	// [16] c3 c3 c3 c3 c2 c2 c2 c2
				185	coeff16hi = _mm_unpacklo_epi16(coeff16hi, coeff16hi);
				186
				187	__m128i src8, src16, mul_hi, mul_lo, t;
				188
				189	#define ITERATION(src, accum) \
				190	src8 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src)); \
				191	src16 = _mm_unpacklo_epi8(src8, zero); \
				192	mul_hi = _mm_mulhi_epi16(src16, coeff16lo); \
				193	mul_lo = _mm_mullo_epi16(src16, coeff16lo); \
				194	t = _mm_unpacklo_epi16(mul_lo, mul_hi); \
				195	accum = _mm_add_epi32(accum, t); \
				196	t = _mm_unpackhi_epi16(mul_lo, mul_hi); \
				197	accum = _mm_add_epi32(accum, t); \
				198	src16 = _mm_unpackhi_epi8(src8, zero); \
				199	mul_hi = _mm_mulhi_epi16(src16, coeff16hi); \
				200	mul_lo = _mm_mullo_epi16(src16, coeff16hi); \
				201	t = _mm_unpacklo_epi16(mul_lo, mul_hi); \
				202	accum = _mm_add_epi32(accum, t); \
				203	t = _mm_unpackhi_epi16(mul_lo, mul_hi); \
				204	accum = _mm_add_epi32(accum, t)
				205
				206	ITERATION(src_data[0] + start, accum0);
				207	ITERATION(src_data[1] + start, accum1);
				208	ITERATION(src_data[2] + start, accum2);
				209	ITERATION(src_data[3] + start, accum3);
				210
				211	start += 16;
				212	filter_values += 4;
				213	}
				214
				215	int r = filter_length & 3;
				216	if (r) {
				217	// Note: filter_values must be padded to align_up(filter_offset, 8);
				218	__m128i coeff;
				219	coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));
				220	// Mask out extra filter taps.
				221	coeff = _mm_and_si128(coeff, mask[r]);
				222
				223	__m128i coeff16lo = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));
				224	/* c1 c1 c1 c1 c0 c0 c0 c0 */
				225	coeff16lo = _mm_unpacklo_epi16(coeff16lo, coeff16lo);
				226	__m128i coeff16hi = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));
				227	coeff16hi = _mm_unpacklo_epi16(coeff16hi, coeff16hi);
				228
				229	__m128i src8, src16, mul_hi, mul_lo, t;
				230
				231	ITERATION(src_data[0] + start, accum0);
				232	ITERATION(src_data[1] + start, accum1);
				233	ITERATION(src_data[2] + start, accum2);
				234	ITERATION(src_data[3] + start, accum3);
				235	}
				236
				237	accum0 = _mm_srai_epi32(accum0, ConvolutionFilter1D::kShiftBits);
				238	accum0 = _mm_packs_epi32(accum0, zero);
				239	accum0 = _mm_packus_epi16(accum0, zero);
				240	accum1 = _mm_srai_epi32(accum1, ConvolutionFilter1D::kShiftBits);
				241	accum1 = _mm_packs_epi32(accum1, zero);
				242	accum1 = _mm_packus_epi16(accum1, zero);
				243	accum2 = _mm_srai_epi32(accum2, ConvolutionFilter1D::kShiftBits);
				244	accum2 = _mm_packs_epi32(accum2, zero);
				245	accum2 = _mm_packus_epi16(accum2, zero);
				246	accum3 = _mm_srai_epi32(accum3, ConvolutionFilter1D::kShiftBits);
				247	accum3 = _mm_packs_epi32(accum3, zero);
				248	accum3 = _mm_packus_epi16(accum3, zero);
				249
				250	(reinterpret_cast<int>(out_row[0])) = _mm_cvtsi128_si32(accum0);
				251	(reinterpret_cast<int>(out_row[1])) = _mm_cvtsi128_si32(accum1);
				252	(reinterpret_cast<int>(out_row[2])) = _mm_cvtsi128_si32(accum2);
				253	(reinterpret_cast<int>(out_row[3])) = _mm_cvtsi128_si32(accum3);
				254
				255	out_row[0] += 4;
				256	out_row[1] += 4;
				257	out_row[2] += 4;
				258	out_row[3] += 4;
				259	}
				260	}
				261
				262	// Does vertical convolution to produce one output row. The filter values and
				263	// length are given in the first two parameters. These are applied to each
				264	// of the rows pointed to in the \|source_data_rows\| array, with each row
				265	// being \|pixel_width\| wide.
				266	//
				267	// The output must have room for \|pixel_width * 4\| bytes.
				268	template<bool has_alpha>
				269	void ConvolveVertically_SSE2(const ConvolutionFilter1D::Fixed* filter_values,
				270	int filter_length,
				271	unsigned char* const* source_data_rows,
				272	int pixel_width,
				273	unsigned char* out_row) {
				274	int width = pixel_width & ~3;
				275
				276	__m128i zero = _mm_setzero_si128();
				277	__m128i accum0, accum1, accum2, accum3, coeff16;
				278	const __m128i* src;
				279	// Output four pixels per iteration (16 bytes).
				280	for (int out_x = 0; out_x < width; out_x += 4) {
				281
				282	// Accumulated result for each pixel. 32 bits per RGBA channel.
				283	accum0 = _mm_setzero_si128();
				284	accum1 = _mm_setzero_si128();
				285	accum2 = _mm_setzero_si128();
				286	accum3 = _mm_setzero_si128();
				287
				288	// Convolve with one filter coefficient per iteration.
				289	for (int filter_y = 0; filter_y < filter_length; filter_y++) {
				290
				291	// Duplicate the filter coefficient 8 times.
				292	// [16] cj cj cj cj cj cj cj cj
				293	coeff16 = _mm_set1_epi16(filter_values[filter_y]);
				294
				295	// Load four pixels (16 bytes) together.
				296	// [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
				297	src = reinterpret_cast<const __m128i*>(
				298	&source_data_rows[filter_y][out_x << 2]);
				299	__m128i src8 = _mm_loadu_si128(src);
				300
				301	// Unpack 1st and 2nd pixels from 8 bits to 16 bits for each channels =>
				302	// multiply with current coefficient => accumulate the result.
				303	// [16] a1 b1 g1 r1 a0 b0 g0 r0
				304	__m128i src16 = _mm_unpacklo_epi8(src8, zero);
				305	__m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);
				306	__m128i mul_lo = _mm_mullo_epi16(src16, coeff16);
				307	// [32] a0 b0 g0 r0
				308	__m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);
				309	accum0 = _mm_add_epi32(accum0, t);
				310	// [32] a1 b1 g1 r1
				311	t = _mm_unpackhi_epi16(mul_lo, mul_hi);
				312	accum1 = _mm_add_epi32(accum1, t);
				313
				314	// Unpack 3rd and 4th pixels from 8 bits to 16 bits for each channels =>
				315	// multiply with current coefficient => accumulate the result.
				316	// [16] a3 b3 g3 r3 a2 b2 g2 r2
				317	src16 = _mm_unpackhi_epi8(src8, zero);
				318	mul_hi = _mm_mulhi_epi16(src16, coeff16);
				319	mul_lo = _mm_mullo_epi16(src16, coeff16);
				320	// [32] a2 b2 g2 r2
				321	t = _mm_unpacklo_epi16(mul_lo, mul_hi);
				322	accum2 = _mm_add_epi32(accum2, t);
				323	// [32] a3 b3 g3 r3
				324	t = _mm_unpackhi_epi16(mul_lo, mul_hi);
				325	accum3 = _mm_add_epi32(accum3, t);
				326	}
				327
				328	// Shift right for fixed point implementation.
				329	accum0 = _mm_srai_epi32(accum0, ConvolutionFilter1D::kShiftBits);
				330	accum1 = _mm_srai_epi32(accum1, ConvolutionFilter1D::kShiftBits);
				331	accum2 = _mm_srai_epi32(accum2, ConvolutionFilter1D::kShiftBits);
				332	accum3 = _mm_srai_epi32(accum3, ConvolutionFilter1D::kShiftBits);
				333
				334	// Packing 32 bits \|accum\| to 16 bits per channel (signed saturation).
				335	// [16] a1 b1 g1 r1 a0 b0 g0 r0
				336	accum0 = _mm_packs_epi32(accum0, accum1);
				337	// [16] a3 b3 g3 r3 a2 b2 g2 r2
				338	accum2 = _mm_packs_epi32(accum2, accum3);
				339
				340	// Packing 16 bits \|accum\| to 8 bits per channel (unsigned saturation).
				341	// [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
				342	accum0 = _mm_packus_epi16(accum0, accum2);
				343
				344	if (has_alpha) {
				345	// Compute the max(ri, gi, bi) for each pixel.
				346	// [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0
				347	__m128i a = _mm_srli_epi32(accum0, 8);
				348	// [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
				349	__m128i b = _mm_max_epu8(a, accum0); // Max of r and g.
				350	// [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0
				351	a = _mm_srli_epi32(accum0, 16);
				352	// [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
				353	b = _mm_max_epu8(a, b); // Max of r and g and b.
				354	// [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00
				355	b = _mm_slli_epi32(b, 24);
				356
				357	// Make sure the value of alpha channel is always larger than maximum
				358	// value of color channels.
				359	accum0 = _mm_max_epu8(b, accum0);
				360	} else {
				361	// Set value of alpha channels to 0xFF.
				362	__m128i mask = _mm_set1_epi32(0xff000000);
				363	accum0 = _mm_or_si128(accum0, mask);
				364	}
				365
				366	// Store the convolution result (16 bytes) and advance the pixel pointers.
				367	_mm_storeu_si128(reinterpret_cast<__m128i*>(out_row), accum0);
				368	out_row += 16;
				369	}
				370
				371	// When the width of the output is not divisible by 4, We need to save one
				372	// pixel (4 bytes) each time. And also the fourth pixel is always absent.
				373	if (pixel_width & 3) {
				374	accum0 = _mm_setzero_si128();
				375	accum1 = _mm_setzero_si128();
				376	accum2 = _mm_setzero_si128();
				377	for (int filter_y = 0; filter_y < filter_length; ++filter_y) {
				378	coeff16 = _mm_set1_epi16(filter_values[filter_y]);
				379	// [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
				380	src = reinterpret_cast<const __m128i*>(
				381	&source_data_rows[filter_y][width<<2]);
				382	__m128i src8 = _mm_loadu_si128(src);
				383	// [16] a1 b1 g1 r1 a0 b0 g0 r0
				384	__m128i src16 = _mm_unpacklo_epi8(src8, zero);
				385	__m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);
				386	__m128i mul_lo = _mm_mullo_epi16(src16, coeff16);
				387	// [32] a0 b0 g0 r0
				388	__m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);
				389	accum0 = _mm_add_epi32(accum0, t);
				390	// [32] a1 b1 g1 r1
				391	t = _mm_unpackhi_epi16(mul_lo, mul_hi);
				392	accum1 = _mm_add_epi32(accum1, t);
				393	// [16] a3 b3 g3 r3 a2 b2 g2 r2
				394	src16 = _mm_unpackhi_epi8(src8, zero);
				395	mul_hi = _mm_mulhi_epi16(src16, coeff16);
				396	mul_lo = _mm_mullo_epi16(src16, coeff16);
				397	// [32] a2 b2 g2 r2
				398	t = _mm_unpacklo_epi16(mul_lo, mul_hi);
				399	accum2 = _mm_add_epi32(accum2, t);
				400	}
				401
				402	accum0 = _mm_srai_epi32(accum0, ConvolutionFilter1D::kShiftBits);
				403	accum1 = _mm_srai_epi32(accum1, ConvolutionFilter1D::kShiftBits);
				404	accum2 = _mm_srai_epi32(accum2, ConvolutionFilter1D::kShiftBits);
				405	// [16] a1 b1 g1 r1 a0 b0 g0 r0
				406	accum0 = _mm_packs_epi32(accum0, accum1);
				407	// [16] a3 b3 g3 r3 a2 b2 g2 r2
				408	accum2 = _mm_packs_epi32(accum2, zero);
				409	// [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
				410	accum0 = _mm_packus_epi16(accum0, accum2);
				411	if (has_alpha) {
				412	// [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0
				413	__m128i a = _mm_srli_epi32(accum0, 8);
				414	// [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
				415	__m128i b = _mm_max_epu8(a, accum0); // Max of r and g.
				416	// [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0
				417	a = _mm_srli_epi32(accum0, 16);
				418	// [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
				419	b = _mm_max_epu8(a, b); // Max of r and g and b.
				420	// [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00
				421	b = _mm_slli_epi32(b, 24);
				422	accum0 = _mm_max_epu8(b, accum0);
				423	} else {
				424	__m128i mask = _mm_set1_epi32(0xff000000);
				425	accum0 = _mm_or_si128(accum0, mask);
				426	}
				427
				428	for (int out_x = width; out_x < pixel_width; out_x++) {
				429	(reinterpret_cast<int>(out_row)) = _mm_cvtsi128_si32(accum0);
				430	accum0 = _mm_srli_si128(accum0, 4);
				431	out_row += 4;
				432	}
				433	}
				434	}
				435
				436	void ConvolveVertically_SSE2(const ConvolutionFilter1D::Fixed* filter_values,
				437	int filter_length,
				438	unsigned char* const* source_data_rows,
				439	int pixel_width,
				440	unsigned char* out_row,
				441	bool has_alpha) {
				442	if (has_alpha) {
				443	ConvolveVertically_SSE2<true>(filter_values,
				444	filter_length,
				445	source_data_rows,
				446	pixel_width,
				447	out_row);
				448	} else {
				449	ConvolveVertically_SSE2<false>(filter_values,
				450	filter_length,
				451	source_data_rows,
				452	pixel_width,
				453	out_row);
				454	}
				455	}
				456
				457	} // namespace skia