Skip to content

Commit fcb5d3a

Browse files
author
Grok Compression
committed
GrkImage: optimize some routines
1 parent 3cc4b5f commit fcb5d3a

4 files changed

Lines changed: 731 additions & 96 deletions

File tree

src/lib/core/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -76,6 +76,7 @@ set(GROK_LIBRARY_SRCS
7676
${CMAKE_CURRENT_SOURCE_DIR}/util/Logger.cpp
7777
${CMAKE_CURRENT_SOURCE_DIR}/util/SparseBuffer.cpp
7878
${CMAKE_CURRENT_SOURCE_DIR}/util/GrkImage.cpp
79+
${CMAKE_CURRENT_SOURCE_DIR}/util/GrkImageSIMD.cpp
7980
${CMAKE_CURRENT_SOURCE_DIR}/util/GrkMatrix.cpp
8081
${CMAKE_CURRENT_SOURCE_DIR}/util/lanes.cpp
8182

src/lib/core/util/GrkImage.h

Lines changed: 163 additions & 96 deletions
Original file line number | Diff line number | Diff line change
@@ -25,6 +25,7 @@
2525

2626
#include "packer.h"
2727
#include "GrkImageMeta.h"
28+
#include "GrkImageSIMD.h"
2829

2930
namespace grk
3031
{
@@ -564,14 +565,22 @@ void clip(grk_image_comp* component, uint8_t precision)
564565
}
565566

566567
// Clip the data
567-
for(uint32_t j = 0; j < component->h; ++j)
568+
if constexpr(std::is_same_v<T, int32_t>)
568569
{
569-
for(uint32_t i = 0; i < component->w; ++i)
570+
hwy_clip_i32(data, component->w, component->h, component->stride, (int32_t)minimum,
571+
(int32_t)maximum);
572+
}
573+
else
574+
{
575+
for(uint32_t j = 0; j < component->h; ++j)
570576
{
571-
data[index] = std::clamp<T>(data[index], minimum, maximum);
572-
index++;
577+
for(uint32_t i = 0; i < component->w; ++i)
578+
{
579+
data[index] = std::clamp<T>(data[index], minimum, maximum);
580+
index++;
581+
}
582+
index += stride_diff;
573583
}
574-
index += stride_diff;
575584
}
576585
component->prec = precision;
577586
}
@@ -678,45 +687,53 @@ bool GrkImage::color_esycc_to_rgb(void)
678687
auto yd = (T*)comps[0].data;
679688
auto bd = (T*)comps[1].data;
680689
auto rd = (T*)comps[2].data;
681-
for(uint32_t j = 0; j < h; ++j)
690+
691+
if constexpr(std::is_same_v<T, int32_t>)
682692
{
683-
for(uint32_t i = 0; i < w; ++i)
693+
hwy_esycc_to_rgb_i32(yd, bd, rd, w, h, comps[0].stride, max_value, flip_value, sign1, sign2);
694+
}
695+
else
696+
{
697+
for(uint32_t j = 0; j < h; ++j)
684698
{
685-
T y = yd[dest_index];
686-
T cb = bd[dest_index];
687-
T cr = rd[dest_index];
688-
689-
if(!sign1)
690-
cb -= flip_value;
691-
if(!sign2)
692-
cr -= flip_value;
693-
694-
T val = (T)(y - 0.0000368 * cb + 1.40199 * cr + 0.5);
695-
696-
if(val > max_value)
697-
val = max_value;
698-
else if(val < 0)
699-
val = 0;
700-
yd[dest_index] = val;
701-
702-
val = (T)(1.0003 * y - 0.344125 * cb - 0.7141128 * cr + 0.5);
703-
704-
if(val > max_value)
705-
val = max_value;
706-
else if(val < 0)
707-
val = 0;
708-
bd[dest_index] = val;
709-
710-
val = (T)(0.999823 * y + 1.77204 * cb - 0.000008 * cr + 0.5);
711-
712-
if(val > max_value)
713-
val = max_value;
714-
else if(val < 0)
715-
val = 0;
716-
rd[dest_index] = val;
717-
dest_index++;
699+
for(uint32_t i = 0; i < w; ++i)
700+
{
701+
T y = yd[dest_index];
702+
T cb = bd[dest_index];
703+
T cr = rd[dest_index];
704+
705+
if(!sign1)
706+
cb -= flip_value;
707+
if(!sign2)
708+
cr -= flip_value;
709+
710+
T val = (T)(y - 0.0000368 * cb + 1.40199 * cr + 0.5);
711+
712+
if(val > max_value)
713+
val = max_value;
714+
else if(val < 0)
715+
val = 0;
716+
yd[dest_index] = val;
717+
718+
val = (T)(1.0003 * y - 0.344125 * cb - 0.7141128 * cr + 0.5);
719+
720+
if(val > max_value)
721+
val = max_value;
722+
else if(val < 0)
723+
val = 0;
724+
bd[dest_index] = val;
725+
726+
val = (T)(0.999823 * y + 1.77204 * cb - 0.000008 * cr + 0.5);
727+
728+
if(val > max_value)
729+
val = max_value;
730+
else if(val < 0)
731+
val = 0;
732+
rd[dest_index] = val;
733+
dest_index++;
734+
}
735+
dest_index += stride_diff;
718736
}
719-
dest_index += stride_diff;
720737
}
721738
color_space = GRK_CLRSPC_SRGB;
722739

@@ -853,23 +870,37 @@ void GrkImage::scaleComponent(grk_image_comp* component, uint8_t precision)
853870
if(component->prec < precision)
854871
{
855872
T scale = (T)(1ULL << diff);
856-
size_t index = 0;
857-
for(uint32_t j = 0; j < component->h; ++j)
873+
if constexpr(std::is_same_v<T, int32_t>)
858874
{
859-
for(uint32_t i = 0; i < component->w; ++i)
860-
data[index++] *= scale;
861-
index += stride_diff;
875+
hwy_scale_mul_i32(data, component->w, component->h, component->stride, scale);
876+
}
877+
else
878+
{
879+
size_t index = 0;
880+
for(uint32_t j = 0; j < component->h; ++j)
881+
{
882+
for(uint32_t i = 0; i < component->w; ++i)
883+
data[index++] *= scale;
884+
index += stride_diff;
885+
}
862886
}
863887
}
864888
else
865889
{
866890
T scale = (T)(1ULL << diff);
867-
size_t index = 0;
868-
for(uint32_t j = 0; j < component->h; ++j)
891+
if constexpr(std::is_same_v<T, int32_t>)
869892
{
870-
for(uint32_t i = 0; i < component->w; ++i)
871-
data[index++] /= scale;
872-
index += stride_diff;
893+
hwy_scale_div_i32(data, component->w, component->h, component->stride, scale);
894+
}
895+
else
896+
{
897+
size_t index = 0;
898+
for(uint32_t j = 0; j < component->h; ++j)
899+
{
900+
for(uint32_t i = 0; i < component->w; ++i)
901+
data[index++] /= scale;
902+
index += stride_diff;
903+
}
873904
}
874905
}
875906
component->prec = precision;
@@ -990,16 +1021,24 @@ bool GrkImage::sycc444_to_rgb(void)
9901021
dst->comps[1].data = nullptr;
9911022
dst->comps[2].data = nullptr;
9921023

993-
for(uint32_t j = 0; j < h; ++j)
1024+
if constexpr(std::is_same_v<T, int32_t>)
9941025
{
995-
for(uint32_t i = 0; i < w; ++i)
996-
sycc_to_rgb<T>(offset, upb, *y++, *cb++, *cr++, r++, g++, b++);
997-
y += src_stride_diff;
998-
cb += src_stride_diff;
999-
cr += src_stride_diff;
1000-
r += dst_stride_diff;
1001-
g += dst_stride_diff;
1002-
b += dst_stride_diff;
1026+
hwy_sycc444_to_rgb_i32(y, cb, cr, r, g, b, w, h, comps[0].stride, dst->comps[0].stride,
1027+
offset, upb);
1028+
}
1029+
else
1030+
{
1031+
for(uint32_t j = 0; j < h; ++j)
1032+
{
1033+
for(uint32_t i = 0; i < w; ++i)
1034+
sycc_to_rgb<T>(offset, upb, *y++, *cb++, *cr++, r++, g++, b++);
1035+
y += src_stride_diff;
1036+
cb += src_stride_diff;
1037+
cr += src_stride_diff;
1038+
r += dst_stride_diff;
1039+
g += dst_stride_diff;
1040+
b += dst_stride_diff;
1041+
}
10031042
}
10041043

10051044
all_components_data_free();
@@ -1735,18 +1774,25 @@ bool GrkImage::applyICC(void)
17351774
auto g = (T*)comps[1].data;
17361775
auto b = (T*)comps[2].data;
17371776

1738-
size_t src_index = 0;
1739-
size_t dest_index = 0;
1740-
for(uint32_t j = 0; j < h; ++j)
1777+
if constexpr(std::is_same_v<T, int32_t>)
17411778
{
1742-
for(uint32_t i = 0; i < w; ++i)
1779+
hwy_planar_to_packed_8(r, g, b, inbuf, w, h, comps[0].stride);
1780+
}
1781+
else
1782+
{
1783+
size_t src_index = 0;
1784+
size_t dest_index = 0;
1785+
for(uint32_t j = 0; j < h; ++j)
17431786
{
1744-
inbuf[dest_index++] = (uint8_t)r[src_index];
1745-
inbuf[dest_index++] = (uint8_t)g[src_index];
1746-
inbuf[dest_index++] = (uint8_t)b[src_index];
1747-
src_index++;
1787+
for(uint32_t i = 0; i < w; ++i)
1788+
{
1789+
inbuf[dest_index++] = (uint8_t)r[src_index];
1790+
inbuf[dest_index++] = (uint8_t)g[src_index];
1791+
inbuf[dest_index++] = (uint8_t)b[src_index];
1792+
src_index++;
1793+
}
1794+
src_index += stride_diff;
17481795
}
1749-
src_index += stride_diff;
17501796
}
17511797

17521798
if(w > UINT32_MAX / 3)
@@ -1759,18 +1805,25 @@ bool GrkImage::applyICC(void)
17591805

17601806
cmsDoTransformLineStride(transform, inbuf, outbuf, w, h, 3 * w, 3 * w, 0, 0);
17611807

1762-
src_index = 0;
1763-
dest_index = 0;
1764-
for(uint32_t j = 0; j < h; ++j)
1808+
if constexpr(std::is_same_v<T, int32_t>)
17651809
{
1766-
for(uint32_t i = 0; i < w; ++i)
1810+
hwy_packed_to_planar_8(outbuf, r, g, b, w, h, comps[0].stride);
1811+
}
1812+
else
1813+
{
1814+
size_t src_index = 0;
1815+
size_t dest_index = 0;
1816+
for(uint32_t j = 0; j < h; ++j)
17671817
{
1768-
r[dest_index] = (T)outbuf[src_index++];
1769-
g[dest_index] = (T)outbuf[src_index++];
1770-
b[dest_index] = (T)outbuf[src_index++];
1771-
dest_index++;
1818+
for(uint32_t i = 0; i < w; ++i)
1819+
{
1820+
r[dest_index] = (T)outbuf[src_index++];
1821+
g[dest_index] = (T)outbuf[src_index++];
1822+
b[dest_index] = (T)outbuf[src_index++];
1823+
dest_index++;
1824+
}
1825+
dest_index += stride_diff;
17721826
}
1773-
dest_index += stride_diff;
17741827
}
17751828
delete[] inbuf;
17761829
delete[] outbuf;
@@ -1790,18 +1843,25 @@ bool GrkImage::applyICC(void)
17901843
auto g = (T*)comps[1].data;
17911844
auto b = (T*)comps[2].data;
17921845

1793-
size_t src_index = 0;
1794-
size_t dest_index = 0;
1795-
for(uint32_t j = 0; j < h; ++j)
1846+
if constexpr(std::is_same_v<T, int32_t>)
17961847
{
1797-
for(uint32_t i = 0; i < w; ++i)
1848+
hwy_planar_to_packed_16(r, g, b, inbuf, w, h, comps[0].stride);
1849+
}
1850+
else
1851+
{
1852+
size_t src_index = 0;
1853+
size_t dest_index = 0;
1854+
for(uint32_t j = 0; j < h; ++j)
17981855
{
1799-
inbuf[dest_index++] = (uint16_t)r[src_index];
1800-
inbuf[dest_index++] = (uint16_t)g[src_index];
1801-
inbuf[dest_index++] = (uint16_t)b[src_index];
1802-
src_index++;
1856+
for(uint32_t i = 0; i < w; ++i)
1857+
{
1858+
inbuf[dest_index++] = (uint16_t)r[src_index];
1859+
inbuf[dest_index++] = (uint16_t)g[src_index];
1860+
inbuf[dest_index++] = (uint16_t)b[src_index];
1861+
src_index++;
1862+
}
1863+
src_index += stride_diff;
18031864
}
1804-
src_index += stride_diff;
18051865
}
18061866

18071867
if(w > UINT32_MAX / (3 * sizeof(uint16_t)))
@@ -1813,18 +1873,25 @@ bool GrkImage::applyICC(void)
18131873
}
18141874
cmsDoTransformLineStride(transform, inbuf, outbuf, w, h, 3 * w * sizeof(uint16_t),
18151875
3 * w * sizeof(uint16_t), 0, 0);
1816-
src_index = 0;
1817-
dest_index = 0;
1818-
for(uint32_t j = 0; j < h; ++j)
1876+
if constexpr(std::is_same_v<T, int32_t>)
18191877
{
1820-
for(uint32_t i = 0; i < w; ++i)
1878+
hwy_packed_to_planar_16(outbuf, r, g, b, w, h, comps[0].stride);
1879+
}
1880+
else
1881+
{
1882+
size_t src_index = 0;
1883+
size_t dest_index = 0;
1884+
for(uint32_t j = 0; j < h; ++j)
18211885
{
1822-
r[dest_index] = (T)outbuf[src_index++];
1823-
g[dest_index] = (T)outbuf[src_index++];
1824-
b[dest_index] = (T)outbuf[src_index++];
1825-
dest_index++;
1886+
for(uint32_t i = 0; i < w; ++i)
1887+
{
1888+
r[dest_index] = (T)outbuf[src_index++];
1889+
g[dest_index] = (T)outbuf[src_index++];
1890+
b[dest_index] = (T)outbuf[src_index++];
1891+
dest_index++;
1892+
}
1893+
dest_index += stride_diff;
18261894
}
1827-
dest_index += stride_diff;
18281895
}
18291896
delete[] inbuf;
18301897
delete[] outbuf;

0 commit comments

Comments
 (0)