• Main Page
  • Related Pages
  • Modules
  • Data Structures
  • Files
  • Examples
  • File List
  • Globals

libswscale/ppc/yuv2rgb_altivec.c

Go to the documentation of this file.
00001 /*
00002  * AltiVec acceleration for colorspace conversion
00003  *
00004  * copyright (C) 2004 Marc Hoffman <marc.hoffman@analog.com>
00005  *
00006  * This file is part of Libav.
00007  *
00008  * Libav is free software; you can redistribute it and/or
00009  * modify it under the terms of the GNU Lesser General Public
00010  * License as published by the Free Software Foundation; either
00011  * version 2.1 of the License, or (at your option) any later version.
00012  *
00013  * Libav is distributed in the hope that it will be useful,
00014  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00015  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00016  * Lesser General Public License for more details.
00017  *
00018  * You should have received a copy of the GNU Lesser General Public
00019  * License along with Libav; if not, write to the Free Software
00020  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
00021  */
00022 
00023 /*
00024 Convert I420 YV12 to RGB in various formats,
00025   it rejects images that are not in 420 formats,
00026   it rejects images that don't have widths of multiples of 16,
00027   it rejects images that don't have heights of multiples of 2.
00028 Rejected images are handled by the C fallback code.
00029 
00030 Lots of optimizations to be done here.
00031 
00032 1. Need to fix saturation code. I just couldn't get it to fly with packs
00033    and adds, so we currently use max/min to clip.
00034 
00035 2. The inefficient use of chroma loading needs a bit of brushing up.
00036 
00037 3. Analysis of pipeline stalls needs to be done. Use shark to identify
00038    pipeline stalls.
00039 
00040 
00041 MODIFIED to calculate coeffs from currently selected color space.
00042 MODIFIED core to be a macro where you specify the output format.
00043 ADDED UYVY conversion which is never called due to something in swscale.
00044 CORRECTED algorithm selection to be strict on input formats.
00045 ADDED runtime detection of AltiVec.
00046 
00047 ADDED altivec_yuv2packedX vertical scl + RGB converter
00048 
00049 March 27,2004
00050 PERFORMANCE ANALYSIS
00051 
00052 The C version uses 25% of the processor or ~250Mips for D1 video rawvideo
00053 used as test.
00054 The AltiVec version uses 10% of the processor or ~100Mips for D1 video
00055 same sequence.
00056 
00057 720 * 480 * 30  ~10MPS
00058 
00059 so we have roughly 10 clocks per pixel. This is too high, something has
00060 to be wrong.
00061 
00062 OPTIMIZED clip codes to utilize vec_max and vec_packs removing the
00063 need for vec_min.
00064 
00065 OPTIMIZED DST OUTPUT cache/DMA controls. We are pretty much guaranteed to have
00066 the input video frame, it was just decompressed so it probably resides in L1
00067 caches. However, we are creating the output video stream. This needs to use the
00068 DSTST instruction to optimize for the cache. We couple this with the fact that
00069 we are not going to be visiting the input buffer again so we mark it Least
00070 Recently Used. This shaves 25% of the processor cycles off.
00071 
00072 Now memcpy is the largest mips consumer in the system, probably due
00073 to the inefficient X11 stuff.
00074 
00075 GL libraries seem to be very slow on this machine 1.33Ghz PB running
00076 Jaguar, this is not the case for my 1Ghz PB.  I thought it might be
00077 a versioning issue, however I have libGL.1.2.dylib for both
00078 machines. (We need to figure this out now.)
00079 
00080 GL2 libraries work now with patch for RGB32.
00081 
00082 NOTE: quartz vo driver ARGB32_to_RGB24 consumes 30% of the processor.
00083 
00084 Integrated luma prescaling adjustment for saturation/contrast/brightness
00085 adjustment.
00086 */
00087 
00088 #include <stdio.h>
00089 #include <stdlib.h>
00090 #include <string.h>
00091 #include <inttypes.h>
00092 #include <assert.h>
00093 #include "config.h"
00094 #include "libswscale/rgb2rgb.h"
00095 #include "libswscale/swscale.h"
00096 #include "libswscale/swscale_internal.h"
00097 #include "libavutil/cpu.h"
00098 #include "yuv2rgb_altivec.h"
00099 
      /* Make sure these two macros are undefined for the rest of this file. */
00100 #undef PROFILE_THE_BEAST
00101 #undef INC_SCALING
00102 
      /* Short aliases for the unsigned and signed char types. */
00103 typedef unsigned char ubyte;
00104 typedef signed char   sbyte;
00105 
00106 
00107 /* RGB interleaver, 16 planar pels 8-bit samples per channel in
00108    homogeneous vector registers x0,x1,x2 are interleaved with the
00109    following technique:
00110 
00111       o0 = vec_mergeh (x0,x1);
00112       o1 = vec_perm (o0, x2, perm_rgb_0);
00113       o2 = vec_perm (o0, x2, perm_rgb_1);
00114       o3 = vec_mergel (x0,x1);
00115       o4 = vec_perm (o3,o2,perm_rgb_2);
00116       o5 = vec_perm (o3,o2,perm_rgb_3);
00117 
00118   perm_rgb_0:   o0(RG).h v1(B) --> o1*
00119               0   1  2   3   4
00120              rgbr|gbrg|brgb|rgbr
00121              0010 0100 1001 0010
00122              0102 3145 2673 894A
00123 
00124   perm_rgb_1:   o0(RG).h v1(B) --> o2
00125               0   1  2   3   4
00126              gbrg|brgb|bbbb|bbbb
00127              0100 1001 1111 1111
00128              B5CD 6EF7 89AB CDEF
00129 
00130   perm_rgb_2:   o3(RG).l o2(rgbB.l) --> o4*
00131               0   1  2   3   4
00132              gbrg|brgb|rgbr|gbrg
00133              1111 1111 0010 0100
00134              89AB CDEF 0182 3945
00135 
00136   perm_rgb_3:   o3(RG).l o2(rgbB.l) ---> o5*
00137               0   1  2   3   4
00138              brgb|rgbr|gbrg|brgb
00139              1001 0010 0100 1001
00140              a67b 89cA BdCD eEFf
00141 
00142 */
00143 static
00144 const vector unsigned char
      /* Byte selectors for the four vec_perm calls in vec_merge3.  Selector
         values 0x00-0x0f pick a byte of the first vec_perm operand and
         0x10-0x1f pick a byte of the second operand. */
      /* (o0, x2) -> y0: byte pairs of o0 with one byte of x2 after each pair */
00145   perm_rgb_0 = {0x00,0x01,0x10,0x02,0x03,0x11,0x04,0x05,
00146                 0x12,0x06,0x07,0x13,0x08,0x09,0x14,0x0a},
      /* (o0, x2) -> o2: the bytes of o0 and x2 not consumed by perm_rgb_0,
         followed by bytes 8..15 of x2 */
00147   perm_rgb_1 = {0x0b,0x15,0x0c,0x0d,0x16,0x0e,0x0f,0x17,
00148                 0x18,0x19,0x1a,0x1b,0x1c,0x1d,0x1e,0x1f},
      /* (o3, o2) -> y1: first 8 bytes of o2, then bytes of o3 interleaved
         with bytes 8 and 9 of o2 */
00149   perm_rgb_2 = {0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,
00150                 0x00,0x01,0x18,0x02,0x03,0x19,0x04,0x05},
      /* (o3, o2) -> y2: remaining bytes of o3 interleaved with bytes
         10..15 of o2 */
00151   perm_rgb_3 = {0x1a,0x06,0x07,0x1b,0x08,0x09,0x1c,0x0a,
00152                 0x0b,0x1d,0x0c,0x0d,0x1e,0x0e,0x0f,0x1f};
00153 
      /* Interleave three 16-byte planar vectors into 48 bytes of packed
         3-byte pixels.  Note the reversed parameter order: each output
         triplet is (x0, x1, x2), i.e. the LAST macro argument of the three
         inputs comes first in memory order.  y0, y1 and y2 receive the
         first, second and third 16 bytes of the interleaved result.
         y0..y2 must be assignable vectors of the same type as x0. */
00154 #define vec_merge3(x2,x1,x0,y0,y1,y2)       \
00155 do {                                        \
00156     __typeof__(x0) o0,o2,o3;                \
00157         o0 = vec_mergeh (x0,x1);            \
00158         y0 = vec_perm (o0, x2, perm_rgb_0); \
00159         o2 = vec_perm (o0, x2, perm_rgb_1); \
00160         o3 = vec_mergel (x0,x1);            \
00161         y1 = vec_perm (o3,o2,perm_rgb_2);   \
00162         y2 = vec_perm (o3,o2,perm_rgb_3);   \
00163 } while(0)
00164 
      /* Interleave 16 pixels with vec_merge3 and store the three resulting
         vectors at ptr.  The arguments are forwarded to vec_merge3 unchanged,
         so each stored triplet is (x2, x1, x0); called with (R,G,B) this
         yields B,G,R byte order.  ptr must be a modifiable vector pointer:
         it is post-incremented once per store, three times in total. */
00165 #define vec_mstbgr24(x0,x1,x2,ptr)      \
00166 do {                                    \
00167     __typeof__(x0) _0,_1,_2;            \
00168     vec_merge3 (x0,x1,x2,_0,_1,_2);     \
00169     vec_st (_0, 0, ptr++);              \
00170     vec_st (_1, 0, ptr++);              \
00171     vec_st (_2, 0, ptr++);              \
00172 }  while (0)
00173 
      /* Same as vec_mstbgr24 but the inputs are handed to vec_merge3 in
         reverse, so each stored triplet is (x0, x1, x2); called with (R,G,B)
         this yields R,G,B byte order.  ptr must be a modifiable vector
         pointer: it is post-incremented once per store, three times in
         total. */
00174 #define vec_mstrgb24(x0,x1,x2,ptr)      \
00175 do {                                    \
00176     __typeof__(x0) _0,_1,_2;            \
00177     vec_merge3 (x2,x1,x0,_0,_1,_2);     \
00178     vec_st (_0, 0, ptr++);              \
00179     vec_st (_1, 0, ptr++);              \
00180     vec_st (_2, 0, ptr++);              \
00181 }  while (0)
00182 
00183 /* pack the pixels in rgb0 format
00184    msb R
00185    lsb 0

         More generally: interleave the four 16-byte vectors x0..x3 into
         sixteen 4-byte pixels whose bytes are (x0, x1, x2, x3) in memory
         order, and store them as four vectors at byte offsets 0, 16, 32
         and 48 from ptr.  T is the vector type used for the temporaries
         and for the store pointer cast.  ptr must be a modifiable pointer;
         it is advanced by 4 elements afterwards.
00186 */
00187 #define vec_mstrgb32(T,x0,x1,x2,x3,ptr)                                       \
00188 do {                                                                          \
00189     T _0,_1,_2,_3;                                                            \
00190     _0 = vec_mergeh (x0,x1);                                                  \
00191     _1 = vec_mergeh (x2,x3);                                                  \
00192     _2 = (T)vec_mergeh ((vector unsigned short)_0,(vector unsigned short)_1); \
00193     _3 = (T)vec_mergel ((vector unsigned short)_0,(vector unsigned short)_1); \
00194     vec_st (_2, 0*16, (T *)ptr);                                              \
00195     vec_st (_3, 1*16, (T *)ptr);                                              \
00196     _0 = vec_mergel (x0,x1);                                                  \
00197     _1 = vec_mergel (x2,x3);                                                  \
00198     _2 = (T)vec_mergeh ((vector unsigned short)_0,(vector unsigned short)_1); \
00199     _3 = (T)vec_mergel ((vector unsigned short)_0,(vector unsigned short)_1); \
00200     vec_st (_2, 2*16, (T *)ptr);                                              \
00201     vec_st (_3, 3*16, (T *)ptr);                                              \
00202     ptr += 4;                                                                 \
00203 }  while (0)
00204 
00205 /*
00206 
00207   | 1     0       1.4021   | | Y |
00208   | 1    -0.3441 -0.7142   |x| Cb|
00209   | 1     1.7718  0        | | Cr|
00210 
00211 
00212   Y:      [-128 127]
00213   Cb/Cr : [-128 127]
00214 
00215   Typical YUV conversions work on Y: 0-255; this version has been optimized for JPEG decode.
00216 
00217 */
00218 
00219 
00220 
00221 
      /* Widen the first 8 bytes (vec_unh) or the last 8 bytes (vec_unl) of x
         to eight 16-bit lanes.  Every lane is built by vec_perm from selector
         0x10 (byte 0 of the all-zero second operand) followed by one data
         byte of x, and the result is cast to vector signed short.
         NOTE(review): with the zero byte placed first this is presumably a
         zero extension on big-endian lane layout -- confirm before using on
         a little-endian target. */
00222 #define vec_unh(x) \
00223     (vector signed short) \
00224         vec_perm(x,(__typeof__(x)){0}, \
00225                  ((vector unsigned char){0x10,0x00,0x10,0x01,0x10,0x02,0x10,0x03,\
00226                                          0x10,0x04,0x10,0x05,0x10,0x06,0x10,0x07}))
00227 #define vec_unl(x) \
00228     (vector signed short) \
00229         vec_perm(x,(__typeof__(x)){0}, \
00230                  ((vector unsigned char){0x10,0x08,0x10,0x09,0x10,0x0A,0x10,0x0B,\
00231                                          0x10,0x0C,0x10,0x0D,0x10,0x0E,0x10,0x0F}))
00232 
      /* Clamp every signed 16-bit lane of x to the range [16, 235]. */
00233 #define vec_clip_s16(x) \
00234     vec_max (vec_min (x, ((vector signed short){235,235,235,235,235,235,235,235})), \
00235                          ((vector signed short){ 16, 16, 16, 16, 16, 16, 16, 16}))
00236 
      /* Pack two vectors of signed 16-bit lanes into one vector of 16
         unsigned bytes.  Negative lanes are first raised to 0 with vec_max;
         the values are then treated as unsigned shorts and narrowed with
         vec_packs, which (per the note in the file header) takes care of the
         upper clip so no vec_min is needed. */
00237 #define vec_packclp(x,y) \
00238     (vector unsigned char)vec_packs \
00239         ((vector unsigned short)vec_max (x,((vector signed short) {0})), \
00240          (vector unsigned short)vec_max (y,((vector signed short) {0})))
00241 
00242 //#define out_pixels(a,b,c,ptr) vec_mstrgb32(__typeof__(a),((__typeof__ (a)){255}),a,a,a,ptr)
00243 
00244 
      /*
       * Convert one vector of eight 16-bit Y/U/V samples to R, G and B using
       * the coefficient vectors stored in the context (CY, OY, CRV, CBU, CGU,
       * CGV, CSHIFT).
       *
       * Y, U, V   input samples, one 16-bit lane per pixel; 128 is subtracted
       *           from U and V here.
       * R, G, B   receive the unclipped signed 16-bit results.
       */
00245 static inline void cvtyuvtoRGB (SwsContext *c,
00246                                 vector signed short Y, vector signed short U, vector signed short V,
00247                                 vector signed short *R, vector signed short *G, vector signed short *B)
00248 {
00249     /* every lane holds 128: the value removed from both chroma inputs */
00250     const vector signed short chroma_bias =
00251         vec_splat((vector signed short){128},0);
00252     vector signed short luma, cb, cr, green_part;
00253 
00254     /* scale luma by CY and add the OY offset */
00255     luma = vec_mradds (Y, c->CY, c->OY);
00256     cb   = vec_sub (U, chroma_bias);
00257     cr   = vec_sub (V, chroma_bias);
00258 
00259     /* B = luma + (CBU*(cb<<CSHIFT)+0x4000)>>15 */
00260     *B = vec_mradds (vec_sl (cb, c->CSHIFT), c->CBU, luma);
00261 
00262     /* R = luma + (CRV*(cr<<CSHIFT)+0x4000)>>15 */
00263     *R = vec_mradds (vec_sl (cr, c->CSHIFT), c->CRV, luma);
00264 
00265     /* G = luma + the CGU*cb term, then the CGV*cr term on top of it */
00266     green_part = vec_mradds (cb, c->CGU, luma);
00267     *G         = vec_mradds (cr, c->CGV, green_part);
00268 }
00269 
00270 
00271 /*
00272   ------------------------------------------------------------------------------
00273   CS converters
00274   ------------------------------------------------------------------------------
00275 */
00276 
00277 
      /*
       * DEFCSP420_CVT(name, out_pixels) defines
       *     static int altivec_<name>(SwsContext *c, const unsigned char **in,
       *                               int *instrides, int srcSliceY,
       *                               int srcSliceH, unsigned char **oplanes,
       *                               int *outstrides)
       * which converts a slice of planar YUV (in[0] = luma, in[1] / in[2] =
       * the two chroma planes) to packed pixels in oplanes[0] and returns
       * srcSliceH.
       *
       * Each pass of the outer loop handles two luma rows that share one
       * chroma row; each pass of the inner loop handles 16 pixels of both
       * rows and consumes 8 bytes of each chroma plane.  Sources are loaded
       * with vec_lvsl + vec_perm, which also touches the 16-byte vector
       * following the current one.  out_pixels(R,G,B,ptr) must store 16
       * pixels at ptr and advance ptr.
       *
       * The two output row pointers are recomputed from oplanes[0] and
       * outstrides[0] for every row pair.  Previously they were advanced by
       * one stride on top of the bytes already written, which is only
       * correct when outstrides[0] equals the packed row size.
       */
00278 #define DEFCSP420_CVT(name,out_pixels)                                  \
00279 static int altivec_##name (SwsContext *c,                               \
00280                            const unsigned char **in, int *instrides,    \
00281                            int srcSliceY,        int srcSliceH,         \
00282                            unsigned char **oplanes, int *outstrides)    \
00283 {                                                                       \
00284     int w = c->srcW;                                                    \
00285     int h = srcSliceH;                                                  \
00286     int i,j;                                                            \
00287     int instrides_scl[3];                                               \
00288     vector unsigned char y0,y1;                                         \
00289                                                                         \
00290     vector signed char  u,v;                                            \
00291                                                                         \
00292     vector signed short Y0,Y1,Y2,Y3;                                    \
00293     vector signed short U,V;                                            \
00294     vector signed short vx,ux,uvx;                                      \
00295     vector signed short vx0,ux0,uvx0;                                   \
00296     vector signed short vx1,ux1,uvx1;                                   \
00297     vector signed short R0,G0,B0;                                       \
00298     vector signed short R1,G1,B1;                                       \
00299     vector unsigned char R,G,B;                                         \
00300                                                                         \
00301     vector unsigned char *y1ivP, *y2ivP, *uivP, *vivP;                  \
00302     vector unsigned char align_perm;                                    \
00303                                                                         \
00304     vector signed short                                                 \
00305         lCY  = c->CY,                                                   \
00306         lOY  = c->OY,                                                   \
00307         lCRV = c->CRV,                                                  \
00308         lCBU = c->CBU,                                                  \
00309         lCGU = c->CGU,                                                  \
00310         lCGV = c->CGV;                                                  \
00311                                                                         \
00312     vector unsigned short lCSHIFT = c->CSHIFT;                          \
00313                                                                         \
00314     const ubyte *y1i   = in[0];                                         \
00315     const ubyte *y2i   = in[0]+instrides[0];                            \
00316     const ubyte *ui    = in[1];                                         \
00317     const ubyte *vi    = in[2];                                         \
00318                                                                         \
00319     vector unsigned char *oute, *outo;  /* even / odd output row */     \
00325                                                                         \
00326                                                                         \
00327     instrides_scl[0] = instrides[0]*2-w;  /* the loop moves y{1,2}i by w */ \
00328     instrides_scl[1] = instrides[1]-w/2;  /* the loop moves ui by w/2 */    \
00329     instrides_scl[2] = instrides[2]-w/2;  /* the loop moves vi by w/2 */    \
00330                                                                         \
00331                                                                         \
00332     for (i=0;i<h/2;i++) {                                               \
              /* locate both rows of this pair through the stride */          \
              oute = (vector unsigned char *)                                 \
                  (oplanes[0]+(srcSliceY+2*i)*outstrides[0]);                 \
              outo = (vector unsigned char *)                                 \
                  (oplanes[0]+(srcSliceY+2*i+1)*outstrides[0]);               \
                                                                              \
00333         vec_dstst (outo, (0x02000002|(((w*3+32)/32)<<16)), 0);          \
00334         vec_dstst (oute, (0x02000002|(((w*3+32)/32)<<16)), 1);          \
00335                                                                         \
00336         for (j=0;j<w/16;j++) {                                          \
00337                                                                         \
00338             y1ivP = (vector unsigned char *)y1i;                        \
00339             y2ivP = (vector unsigned char *)y2i;                        \
00340             uivP  = (vector unsigned char *)ui;                         \
00341             vivP  = (vector unsigned char *)vi;                         \
00342                                                                         \
00343             align_perm = vec_lvsl (0, y1i);                             \
00344             y0 = (vector unsigned char)                                 \
00345                  vec_perm (y1ivP[0], y1ivP[1], align_perm);             \
00346                                                                         \
00347             align_perm = vec_lvsl (0, y2i);                             \
00348             y1 = (vector unsigned char)                                 \
00349                  vec_perm (y2ivP[0], y2ivP[1], align_perm);             \
00350                                                                         \
00351             align_perm = vec_lvsl (0, ui);                              \
00352             u = (vector signed char)                                    \
00353                 vec_perm (uivP[0], uivP[1], align_perm);                \
00354                                                                         \
00355             align_perm = vec_lvsl (0, vi);                              \
00356             v = (vector signed char)                                    \
00357                 vec_perm (vivP[0], vivP[1], align_perm);                \
00358                                                                         \
00359             u  = (vector signed char)                                   \
00360                  vec_sub (u,(vector signed char)                        \
00361                           vec_splat((vector signed char){128},0));      \
00362             v  = (vector signed char)                                   \
00363                  vec_sub (v,(vector signed char)                        \
00364                           vec_splat((vector signed char){128},0));      \
00365                                                                         \
00366             U  = vec_unpackh (u);                                       \
00367             V  = vec_unpackh (v);                                       \
00368                                                                         \
00369                                                                         \
00370             Y0 = vec_unh (y0);                                          \
00371             Y1 = vec_unl (y0);                                          \
00372             Y2 = vec_unh (y1);                                          \
00373             Y3 = vec_unl (y1);                                          \
00374                                                                         \
00375             Y0 = vec_mradds (Y0, lCY, lOY);                             \
00376             Y1 = vec_mradds (Y1, lCY, lOY);                             \
00377             Y2 = vec_mradds (Y2, lCY, lOY);                             \
00378             Y3 = vec_mradds (Y3, lCY, lOY);                             \
00379                                                                         \
00380             /*   ux  = (CBU*(u<<CSHIFT)+0x4000)>>15 */                  \
00381             ux = vec_sl (U, lCSHIFT);                                   \
00382             ux = vec_mradds (ux, lCBU, (vector signed short){0});       \
00383             ux0  = vec_mergeh (ux,ux);                                  \
00384             ux1  = vec_mergel (ux,ux);                                  \
00385                                                                         \
00386             /* vx  = (CRV*(v<<CSHIFT)+0x4000)>>15;        */            \
00387             vx = vec_sl (V, lCSHIFT);                                   \
00388             vx = vec_mradds (vx, lCRV, (vector signed short){0});       \
00389             vx0  = vec_mergeh (vx,vx);                                  \
00390             vx1  = vec_mergel (vx,vx);                                  \
00391                                                                         \
00392             /* uvx = ((CGU*u) + (CGV*v))>>15 */                         \
00393             uvx = vec_mradds (U, lCGU, (vector signed short){0});       \
00394             uvx = vec_mradds (V, lCGV, uvx);                            \
00395             uvx0 = vec_mergeh (uvx,uvx);                                \
00396             uvx1 = vec_mergel (uvx,uvx);                                \
00397                                                                         \
00398             R0 = vec_add (Y0,vx0);                                      \
00399             G0 = vec_add (Y0,uvx0);                                     \
00400             B0 = vec_add (Y0,ux0);                                      \
00401             R1 = vec_add (Y1,vx1);                                      \
00402             G1 = vec_add (Y1,uvx1);                                     \
00403             B1 = vec_add (Y1,ux1);                                      \
00404                                                                         \
00405             R  = vec_packclp (R0,R1);                                   \
00406             G  = vec_packclp (G0,G1);                                   \
00407             B  = vec_packclp (B0,B1);                                   \
00408                                                                         \
00409             out_pixels(R,G,B,oute);                                     \
00410                                                                         \
00411             R0 = vec_add (Y2,vx0);                                      \
00412             G0 = vec_add (Y2,uvx0);                                     \
00413             B0 = vec_add (Y2,ux0);                                      \
00414             R1 = vec_add (Y3,vx1);                                      \
00415             G1 = vec_add (Y3,uvx1);                                     \
00416             B1 = vec_add (Y3,ux1);                                      \
00417             R  = vec_packclp (R0,R1);                                   \
00418             G  = vec_packclp (G0,G1);                                   \
00419             B  = vec_packclp (B0,B1);                                   \
00420                                                                         \
00421                                                                         \
00422             out_pixels(R,G,B,outo);                                     \
00423                                                                         \
00424             y1i  += 16;                                                 \
00425             y2i  += 16;                                                 \
00426             ui   += 8;                                                  \
00427             vi   += 8;                                                  \
00428                                                                         \
00429         }                                                               \
00430                                                                         \
00434         ui    += instrides_scl[1];                                      \
00435         vi    += instrides_scl[2];                                      \
00436         y1i   += instrides_scl[0];                                      \
00437         y2i   += instrides_scl[0];                                      \
00438     }                                                                   \
00439     return srcSliceH;                                                   \
00440 }
00441 
00442 
00443 #define out_abgr(a,b,c,ptr)  vec_mstrgb32(__typeof__(a),((__typeof__ (a)){255}),c,b,a,ptr)
00444 #define out_bgra(a,b,c,ptr)  vec_mstrgb32(__typeof__(a),c,b,a,((__typeof__ (a)){255}),ptr)
00445 #define out_rgba(a,b,c,ptr)  vec_mstrgb32(__typeof__(a),a,b,c,((__typeof__ (a)){255}),ptr)
00446 #define out_argb(a,b,c,ptr)  vec_mstrgb32(__typeof__(a),((__typeof__ (a)){255}),a,b,c,ptr)
00447 #define out_rgb24(a,b,c,ptr) vec_mstrgb24(a,b,c,ptr)
00448 #define out_bgr24(a,b,c,ptr) vec_mstbgr24(a,b,c,ptr)
00449 
00450 DEFCSP420_CVT (yuv2_abgr, out_abgr)
00451 DEFCSP420_CVT (yuv2_bgra, out_bgra)
00452 DEFCSP420_CVT (yuv2_rgba, out_rgba)
00453 DEFCSP420_CVT (yuv2_argb, out_argb)
00454 DEFCSP420_CVT (yuv2_rgb24,  out_rgb24)
00455 DEFCSP420_CVT (yuv2_bgr24,  out_bgr24)
00456 
00457 
00458 // uyvy|uyvy|uyvy|uyvy
00459 // 0123 4567 89ab cdef
      // Selectors for vec_perm(uyvy, zero, demux_x): every 16-bit lane is made
      // of selector 0x10 (a byte of the zero operand) followed by one source
      // byte.  demux_u takes bytes 0,4,8,12 and demux_v bytes 2,6,10,14, each
      // repeated in two neighbouring lanes; demux_y takes the odd bytes 1..15.
00460 static
00461 const vector unsigned char
00462     demux_u = {0x10,0x00,0x10,0x00,
00463                0x10,0x04,0x10,0x04,
00464                0x10,0x08,0x10,0x08,
00465                0x10,0x0c,0x10,0x0c},
00466     demux_v = {0x10,0x02,0x10,0x02,
00467                0x10,0x06,0x10,0x06,
00468                0x10,0x0A,0x10,0x0A,
00469                0x10,0x0E,0x10,0x0E},
00470     demux_y = {0x10,0x01,0x10,0x03,
00471                0x10,0x05,0x10,0x07,
00472                0x10,0x09,0x10,0x0B,
00473                0x10,0x0D,0x10,0x0F};
00474 
00475 /*
00476   this is so I can play live CCIR raw video

        Converts a slice of packed UYVY (in[0]) to 4-byte pixels in oplanes[0]
        through out_rgba and returns srcSliceH.  Each inner pass loads two
        16-byte vectors (16 pixels) with vec_ld and stores 64 bytes.  Only
        w/16 whole groups per row are converted.

        Every row starts at in[0] + i*instrides[0] on the input side and at
        oplanes[0] + (srcSliceY+i)*outstrides[0] on the output side, so padded
        line sizes are honoured.
00477 */
00478 static int altivec_uyvy_rgb32 (SwsContext *c,
00479                                const unsigned char **in, int *instrides,
00480                                int srcSliceY,        int srcSliceH,
00481                                unsigned char **oplanes, int *outstrides)
00482 {
00483     int w = c->srcW;
00484     int h = srcSliceH;
00485     int i,j;
00486     vector unsigned char uyvy;
00487     vector signed   short Y,U,V;
00488     vector signed   short R0,G0,B0,R1,G1,B1;
00489     vector unsigned char  R,G,B;
00490     vector unsigned char *out;
00491     const ubyte *img;
00492 
00493     for (i=0;i<h;i++) {
00494         /* restart from the row origin; the strides may include padding */
00495         img = in[0] + i*instrides[0];
00496         out = (vector unsigned char *)(oplanes[0]+(srcSliceY+i)*outstrides[0]);
00497         for (j=0;j<w/16;j++) {
00498             uyvy = vec_ld (0, img);
00499             U = (vector signed short)
00500                 vec_perm (uyvy, (vector unsigned char){0}, demux_u);
00501 
00502             V = (vector signed short)
00503                 vec_perm (uyvy, (vector unsigned char){0}, demux_v);
00504 
00505             Y = (vector signed short)
00506                 vec_perm (uyvy, (vector unsigned char){0}, demux_y);
00507 
00508             cvtyuvtoRGB (c, Y,U,V,&R0,&G0,&B0);
00509 
00510             uyvy = vec_ld (16, img);
00511             U = (vector signed short)
00512                 vec_perm (uyvy, (vector unsigned char){0}, demux_u);
00513 
00514             V = (vector signed short)
00515                 vec_perm (uyvy, (vector unsigned char){0}, demux_v);
00516 
00517             Y = (vector signed short)
00518                 vec_perm (uyvy, (vector unsigned char){0}, demux_y);
00519 
00520             cvtyuvtoRGB (c, Y,U,V,&R1,&G1,&B1);
00521 
00522             R  = vec_packclp (R0,R1);
00523             G  = vec_packclp (G0,G1);
00524             B  = vec_packclp (B0,B1);
00525 
00526             /* out_rgba stores 16 pixels and advances out by four vectors */
00527             out_rgba (R,G,B,out);
00528 
00529             img += 32;
00530         }
00531     }
00532     return srcSliceH;
00533 }
00534 
00535 
00536 
00537 /* Ok currently the acceleration routine only supports
00538    inputs of widths a multiple of 16
00539    and heights a multiple of 2
00540 
00541    So we just fall back to the C codes for this.

         Returns the AltiVec converter for c->srcFormat -> c->dstFormat, or
         NULL when AltiVec is not available at run time, when the size
         constraints above are not met, or when the format pair is not
         handled here.

         The planar converters read in[1] and in[2] as two separate chroma
         planes with one sample per 2x2 luma block, so only PIX_FMT_YUV420P is
         accepted as planar input.  YUV410P, GRAY8, NV12 and NV21 used to be
         routed to them as well and now get NULL instead.
00542 */
00543 SwsFunc ff_yuv2rgb_init_altivec(SwsContext *c)
00544 {
00545     if (!(av_get_cpu_flags() & AV_CPU_FLAG_ALTIVEC))
00546         return NULL;
00547 
00548     /*
00549       and this seems not to matter too much I tried a bunch of
00550       videos with abnormal widths and MPlayer crashes elsewhere.
00551       mplayer -vo x11 -rawvideo on:w=350:h=240 raw-350x240.eyuv
00552       boom with X11 bad match.
00553 
00554     */
00555     if ((c->srcW & 0xf) != 0)    return NULL;
00556 
00557     switch (c->srcFormat) {
00559     case PIX_FMT_YUV420P:
          /* the converters consume two luma rows per pass */
00564         if ((c->srcH & 0x1) != 0)
00565             return NULL;
00566 
00567         switch(c->dstFormat) {
00568         case PIX_FMT_RGB24:
00569             av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space RGB24\n");
00570             return altivec_yuv2_rgb24;
00571         case PIX_FMT_BGR24:
00572             av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space BGR24\n");
00573             return altivec_yuv2_bgr24;
00574         case PIX_FMT_ARGB:
00575             av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space ARGB\n");
00576             return altivec_yuv2_argb;
00577         case PIX_FMT_ABGR:
00578             av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space ABGR\n");
00579             return altivec_yuv2_abgr;
00580         case PIX_FMT_RGBA:
00581             av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space RGBA\n");
00582             return altivec_yuv2_rgba;
00583         case PIX_FMT_BGRA:
00584             av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space BGRA\n");
00585             return altivec_yuv2_bgra;
00586         default: return NULL;
00587         }
00588         break;
00589 
00590     case PIX_FMT_UYVY422:
00591         switch(c->dstFormat) {
00592         case PIX_FMT_BGR32:
00593             av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space UYVY -> RGB32\n");
00594             return altivec_uyvy_rgb32;
00595         default: return NULL;
00596         }
00597         break;
00598 
          default:
              break;
00599     }
00600     return NULL;
00601 }
00602 
      /*
       * Fill the AltiVec coefficient vectors of the context (CSHIFT, CY, OY,
       * CRV, CBU, CGU, CGV) from the inverse colorspace table and the
       * brightness / contrast / saturation settings.  Each coefficient is
       * computed as a scalar, truncated to a signed short and then broadcast
       * to all eight lanes of its vector.
       */
00603 void ff_yuv2rgb_init_tables_altivec(SwsContext *c, const int inv_table[4], int brightness, int contrast, int saturation)
00604 {
00605     /* aligned scratch: written as scalars, read back as one vector */
00606     union {
00607         DECLARE_ALIGNED(16, signed short, tmp)[8];
00608         vector signed short vec;
00609     } coef;
00610 
00611     /* luma gain and luma offset */
00612     coef.tmp[0] =  ((0xffffLL) * contrast>>8)>>9;                        //cy
00613     coef.tmp[1] =  -256*brightness;                                      //oy
00614     /* chroma gains; the two green terms are negated */
00615     coef.tmp[2] =  (inv_table[0]>>3) *(contrast>>16)*(saturation>>16);   //crv
00616     coef.tmp[3] =  (inv_table[1]>>3) *(contrast>>16)*(saturation>>16);   //cbu
00617     coef.tmp[4] = -((inv_table[2]>>1)*(contrast>>16)*(saturation>>16));  //cgu
00618     coef.tmp[5] = -((inv_table[3]>>1)*(contrast>>16)*(saturation>>16));  //cgv
00619 
00620     /* broadcast each scalar into its own context vector */
00621     c->CY   = vec_splat (coef.vec, 0);
00622     c->OY   = vec_splat (coef.vec, 1);
00623     c->CRV  = vec_splat (coef.vec, 2);
00624     c->CBU  = vec_splat (coef.vec, 3);
00625     c->CGU  = vec_splat (coef.vec, 4);
00626     c->CGV  = vec_splat (coef.vec, 5);
          /* left shift applied to chroma before the CRV/CBU multiplies */
          c->CSHIFT = (vector unsigned short)vec_splat_u16(2);
      }
00627 
00628 
00629 static av_always_inline void
00630 ff_yuv2packedX_altivec(SwsContext *c, const int16_t *lumFilter,
00631                        const int16_t **lumSrc, int lumFilterSize,
00632                        const int16_t *chrFilter, const int16_t **chrUSrc,
00633                        const int16_t **chrVSrc, int chrFilterSize,
00634                        const int16_t **alpSrc, uint8_t *dest,
00635                        int dstW, int dstY, enum PixelFormat target)
00636 {
00637     int i,j;
00638     vector signed short X,X0,X1,Y0,U0,V0,Y1,U1,V1,U,V;
00639     vector signed short R0,G0,B0,R1,G1,B1;
00640 
00641     vector unsigned char R,G,B;
00642     vector unsigned char *out,*nout;
00643 
00644     vector signed short   RND = vec_splat_s16(1<<3);
00645     vector unsigned short SCL = vec_splat_u16(4);
00646     DECLARE_ALIGNED(16, unsigned int, scratch)[16];
00647 
00648     vector signed short *YCoeffs, *CCoeffs;
00649 
00650     YCoeffs = c->vYCoeffsBank+dstY*lumFilterSize;
00651     CCoeffs = c->vCCoeffsBank+dstY*chrFilterSize;
00652 
00653     out = (vector unsigned char *)dest;
00654 
00655     for (i=0; i<dstW; i+=16) {
00656         Y0 = RND;
00657         Y1 = RND;
00658         /* extract 16 coeffs from lumSrc */
00659         for (j=0; j<lumFilterSize; j++) {
00660             X0 = vec_ld (0,  &lumSrc[j][i]);
00661             X1 = vec_ld (16, &lumSrc[j][i]);
00662             Y0 = vec_mradds (X0, YCoeffs[j], Y0);
00663             Y1 = vec_mradds (X1, YCoeffs[j], Y1);
00664         }
00665 
00666         U = RND;
00667         V = RND;
00668         /* extract 8 coeffs from U,V */
00669         for (j=0; j<chrFilterSize; j++) {
00670             X  = vec_ld (0, &chrUSrc[j][i/2]);
00671             U  = vec_mradds (X, CCoeffs[j], U);
00672             X  = vec_ld (0, &chrVSrc[j][i/2]);
00673             V  = vec_mradds (X, CCoeffs[j], V);
00674         }
00675 
00676         /* scale and clip signals */
00677         Y0 = vec_sra (Y0, SCL);
00678         Y1 = vec_sra (Y1, SCL);
00679         U  = vec_sra (U,  SCL);
00680         V  = vec_sra (V,  SCL);
00681 
00682         Y0 = vec_clip_s16 (Y0);
00683         Y1 = vec_clip_s16 (Y1);
00684         U  = vec_clip_s16 (U);
00685         V  = vec_clip_s16 (V);
00686 
00687         /* now we have
00688           Y0= y0 y1 y2 y3 y4 y5 y6 y7     Y1= y8 y9 y10 y11 y12 y13 y14 y15
00689           U= u0 u1 u2 u3 u4 u5 u6 u7      V= v0 v1 v2 v3 v4 v5 v6 v7
00690 
00691           Y0= y0 y1 y2 y3 y4 y5 y6 y7    Y1= y8 y9 y10 y11 y12 y13 y14 y15
00692           U0= u0 u0 u1 u1 u2 u2 u3 u3    U1= u4 u4 u5 u5 u6 u6 u7 u7
00693           V0= v0 v0 v1 v1 v2 v2 v3 v3    V1= v4 v4 v5 v5 v6 v6 v7 v7
00694         */
00695 
00696         U0 = vec_mergeh (U,U);
00697         V0 = vec_mergeh (V,V);
00698 
00699         U1 = vec_mergel (U,U);
00700         V1 = vec_mergel (V,V);
00701 
00702         cvtyuvtoRGB (c, Y0,U0,V0,&R0,&G0,&B0);
00703         cvtyuvtoRGB (c, Y1,U1,V1,&R1,&G1,&B1);
00704 
00705         R  = vec_packclp (R0,R1);
00706         G  = vec_packclp (G0,G1);
00707         B  = vec_packclp (B0,B1);
00708 
00709         switch(target) {
00710         case PIX_FMT_ABGR:  out_abgr  (R,G,B,out); break;
00711         case PIX_FMT_BGRA:  out_bgra  (R,G,B,out); break;
00712         case PIX_FMT_RGBA:  out_rgba  (R,G,B,out); break;
00713         case PIX_FMT_ARGB:  out_argb  (R,G,B,out); break;
00714         case PIX_FMT_RGB24: out_rgb24 (R,G,B,out); break;
00715         case PIX_FMT_BGR24: out_bgr24 (R,G,B,out); break;
00716         default:
00717             {
00718                 /* If this is reached, the caller should have called yuv2packedXinC
00719                    instead. */
00720                 static int printed_error_message;
00721                 if (!printed_error_message) {
00722                     av_log(c, AV_LOG_ERROR, "altivec_yuv2packedX doesn't support %s output\n",
00723                            sws_format_name(c->dstFormat));
00724                     printed_error_message=1;
00725                 }
00726                 return;
00727             }
00728         }
00729     }
00730 
00731     if (i < dstW) {
00732         i -= 16;
00733 
00734         Y0 = RND;
00735         Y1 = RND;
00736         /* extract 16 coeffs from lumSrc */
00737         for (j=0; j<lumFilterSize; j++) {
00738             X0 = vec_ld (0,  &lumSrc[j][i]);
00739             X1 = vec_ld (16, &lumSrc[j][i]);
00740             Y0 = vec_mradds (X0, YCoeffs[j], Y0);
00741             Y1 = vec_mradds (X1, YCoeffs[j], Y1);
00742         }
00743 
00744         U = RND;
00745         V = RND;
00746         /* extract 8 coeffs from U,V */
00747         for (j=0; j<chrFilterSize; j++) {
00748             X  = vec_ld (0, &chrUSrc[j][i/2]);
00749             U  = vec_mradds (X, CCoeffs[j], U);
00750             X  = vec_ld (0, &chrVSrc[j][i/2]);
00751             V  = vec_mradds (X, CCoeffs[j], V);
00752         }
00753 
00754         /* scale and clip signals */
00755         Y0 = vec_sra (Y0, SCL);
00756         Y1 = vec_sra (Y1, SCL);
00757         U  = vec_sra (U,  SCL);
00758         V  = vec_sra (V,  SCL);
00759 
00760         Y0 = vec_clip_s16 (Y0);
00761         Y1 = vec_clip_s16 (Y1);
00762         U  = vec_clip_s16 (U);
00763         V  = vec_clip_s16 (V);
00764 
00765         /* now we have
00766            Y0= y0 y1 y2 y3 y4 y5 y6 y7     Y1= y8 y9 y10 y11 y12 y13 y14 y15
00767            U = u0 u1 u2 u3 u4 u5 u6 u7     V = v0 v1 v2 v3 v4 v5 v6 v7
00768 
00769            Y0= y0 y1 y2 y3 y4 y5 y6 y7    Y1= y8 y9 y10 y11 y12 y13 y14 y15
00770            U0= u0 u0 u1 u1 u2 u2 u3 u3    U1= u4 u4 u5 u5 u6 u6 u7 u7
00771            V0= v0 v0 v1 v1 v2 v2 v3 v3    V1= v4 v4 v5 v5 v6 v6 v7 v7
00772         */
00773 
00774         U0 = vec_mergeh (U,U);
00775         V0 = vec_mergeh (V,V);
00776 
00777         U1 = vec_mergel (U,U);
00778         V1 = vec_mergel (V,V);
00779 
00780         cvtyuvtoRGB (c, Y0,U0,V0,&R0,&G0,&B0);
00781         cvtyuvtoRGB (c, Y1,U1,V1,&R1,&G1,&B1);
00782 
00783         R  = vec_packclp (R0,R1);
00784         G  = vec_packclp (G0,G1);
00785         B  = vec_packclp (B0,B1);
00786 
00787         nout = (vector unsigned char *)scratch;
00788         switch(target) {
00789         case PIX_FMT_ABGR:  out_abgr  (R,G,B,nout); break;
00790         case PIX_FMT_BGRA:  out_bgra  (R,G,B,nout); break;
00791         case PIX_FMT_RGBA:  out_rgba  (R,G,B,nout); break;
00792         case PIX_FMT_ARGB:  out_argb  (R,G,B,nout); break;
00793         case PIX_FMT_RGB24: out_rgb24 (R,G,B,nout); break;
00794         case PIX_FMT_BGR24: out_bgr24 (R,G,B,nout); break;
00795         default:
00796             /* Unreachable, I think. */
00797             av_log(c, AV_LOG_ERROR, "altivec_yuv2packedX doesn't support %s output\n",
00798                    sws_format_name(c->dstFormat));
00799             return;
00800         }
00801 
00802         memcpy (&((uint32_t*)dest)[i], scratch, (dstW-i)/4);
00803     }
00804 
00805 }
00806 
/*
 * Define the exported function ff_yuv2<name>_X_altivec(). It takes the same
 * arguments as ff_yuv2packedX_altivec() minus the trailing target format,
 * and forwards them unchanged with `fmt` supplied as that target.
 */
#define YUV2PACKEDX_WRAPPER(name, fmt)                                        \
void ff_yuv2 ## name ## _X_altivec(SwsContext *c,                             \
                                   const int16_t *lumFilter,                  \
                                   const int16_t **lumSrc,                    \
                                   int lumFilterSize,                         \
                                   const int16_t *chrFilter,                  \
                                   const int16_t **chrUSrc,                   \
                                   const int16_t **chrVSrc,                   \
                                   int chrFilterSize,                         \
                                   const int16_t **alpSrc,                    \
                                   uint8_t *dest,                             \
                                   int dstW, int dstY)                        \
{                                                                             \
    ff_yuv2packedX_altivec(c,                                                 \
                           lumFilter, lumSrc, lumFilterSize,                  \
                           chrFilter, chrUSrc, chrVSrc, chrFilterSize,        \
                           alpSrc, dest, dstW, dstY,                          \
                           fmt);                                              \
}
00819 
00820 YUV2PACKEDX_WRAPPER(abgr,  PIX_FMT_ABGR);
00821 YUV2PACKEDX_WRAPPER(bgra,  PIX_FMT_BGRA);
00822 YUV2PACKEDX_WRAPPER(argb,  PIX_FMT_ARGB);
00823 YUV2PACKEDX_WRAPPER(rgba,  PIX_FMT_RGBA);
00824 YUV2PACKEDX_WRAPPER(rgb24, PIX_FMT_RGB24);
00825 YUV2PACKEDX_WRAPPER(bgr24, PIX_FMT_BGR24);
Generated on Thu Jul 11 2013 15:38:25 for Libav by doxygen 1.7.1