• Main Page
  • Related Pages
  • Modules
  • Data Structures
  • Files
  • Examples
  • File List
  • Globals

libavcodec/ppc/dsputil_ppc.c

Go to the documentation of this file.
00001 /*
00002  * Copyright (c) 2002 Brian Foley
00003  * Copyright (c) 2002 Dieter Shirley
00004  * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org>
00005  *
00006  * This file is part of Libav.
00007  *
00008  * Libav is free software; you can redistribute it and/or
00009  * modify it under the terms of the GNU Lesser General Public
00010  * License as published by the Free Software Foundation; either
00011  * version 2.1 of the License, or (at your option) any later version.
00012  *
00013  * Libav is distributed in the hope that it will be useful,
00014  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00015  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00016  * Lesser General Public License for more details.
00017  *
00018  * You should have received a copy of the GNU Lesser General Public
00019  * License along with Libav; if not, write to the Free Software
00020  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
00021  */
00022 
00023 #include "libavutil/cpu.h"
00024 #include "libavcodec/dsputil.h"
00025 #include "dsputil_altivec.h"
00026 
00027 /* ***** WARNING ***** WARNING ***** WARNING ***** */
00028 /*
00029 clear_blocks_dcbz32_ppc will not work properly on PowerPC processors with a
00030 cache line size not equal to 32 bytes.
00031 Fortunately all processors used by Apple up to at least the 7450 (aka second
00032 generation G4) use a 32-byte cache line.
00033 This is due to the use of the 'dcbz' instruction. It simply clears a
00034 single cache line to zero, so you need to know the cache line size to use it!
00035 It's absurd, but it's fast...
00036 
00037 update 24/06/2003: Apple released the G5 yesterday, with a PPC970. Cache line
00038 size: 128 bytes. Oops.
00039 The semantics of dcbz were changed: it now always clears 32 bytes. So the function
00040 below will work, but will be slow. So I fixed check_dcbz_effect to use dcbzl,
00041 which is defined to clear a cache line (as dcbz did before). So we can still
00042 distinguish, and use dcbz (32 bytes) or dcbzl (one cache line) as required.
00043 
00044 see <http://developer.apple.com/technotes/tn/tn2087.html>
00045 and <http://developer.apple.com/technotes/tn/tn2086.html>
00046 */
00047 static void clear_blocks_dcbz32_ppc(DCTELEM *blocks)
00048 {
00049     register int misal = ((unsigned long)blocks & 0x00000010);
00050     register int i = 0;
00051     if (misal) {
00052         ((unsigned long*)blocks)[0] = 0L;
00053         ((unsigned long*)blocks)[1] = 0L;
00054         ((unsigned long*)blocks)[2] = 0L;
00055         ((unsigned long*)blocks)[3] = 0L;
00056         i += 16;
00057     }
00058     for ( ; i < sizeof(DCTELEM)*6*64-31 ; i += 32) {
00059         __asm__ volatile("dcbz %0,%1" : : "b" (blocks), "r" (i) : "memory");
00060     }
00061     if (misal) {
00062         ((unsigned long*)blocks)[188] = 0L;
00063         ((unsigned long*)blocks)[189] = 0L;
00064         ((unsigned long*)blocks)[190] = 0L;
00065         ((unsigned long*)blocks)[191] = 0L;
00066         i += 16;
00067     }
00068 }
00069 
00070 /* same as above, when dcbzl clear a whole 128B cache line
00071    i.e. the PPC970 aka G5 */
00072 #if HAVE_DCBZL
00073 static void clear_blocks_dcbz128_ppc(DCTELEM *blocks)
00074 {
00075     register int misal = ((unsigned long)blocks & 0x0000007f);
00076     register int i = 0;
00077     if (misal) {
00078         // we could probably also optimize this case,
00079         // but there's not much point as the machines
00080         // aren't available yet (2003-06-26)
00081         memset(blocks, 0, sizeof(DCTELEM)*6*64);
00082     }
00083     else
00084         for ( ; i < sizeof(DCTELEM)*6*64 ; i += 128) {
00085             __asm__ volatile("dcbzl %0,%1" : : "b" (blocks), "r" (i) : "memory");
00086         }
00087 }
00088 #else
00089 static void clear_blocks_dcbz128_ppc(DCTELEM *blocks)
00090 {
00091     memset(blocks, 0, sizeof(DCTELEM)*6*64);
00092 }
00093 #endif
00094 
/**
 * Report how many bytes a single 'dcbzl' instruction sets to zero.
 *
 * A 1024-byte scratch buffer is filled with 0xFF, dcbzl is executed on its
 * middle, and the zero bytes found afterwards are counted.
 *
 * update 24/06/2003: dcbzl is used instead of dcbz to get the intended
 * effect (Apple "fixed" dcbz); unfortunately this cannot be used unless the
 * assembler knows about dcbzl, hence the HAVE_DCBZL guard.
 *
 * @return number of zeroed bytes, or 0 when HAVE_DCBZL is not set or the
 *         scratch buffer could not be allocated
 */
static long check_dcbzl_effect(void)
{
    long count = 0;
#if HAVE_DCBZL
    char *fakedata = av_malloc(1024);
    char *fakedata_middle;
    long zero = 0;
    long i;

    if (!fakedata) {
        return 0L;
    }

    fakedata_middle = fakedata + 512;

    memset(fakedata, 0xFF, 1024);

    /* below the constraint "b" seems to mean "Address base register"
       in gcc-3.3 / RS/6000 speaks. seems to avoid using r0, so....
       The "memory" clobber tells the compiler that the asm writes to
       memory, so the buffer contents must be re-read by the loop below
       instead of being assumed to still hold 0xFF. */
    __asm__ volatile("dcbzl %0, %1" : : "b" (fakedata_middle), "r" (zero) : "memory");

    for (i = 0; i < 1024; i++) {
        if (fakedata[i] == (char)0)
            count++;
    }

    av_free(fakedata);
#endif

    return count;
}
00136 
/**
 * Issue a 'dcbt' (data cache block touch) for h addresses, starting at mem
 * and advancing by stride bytes between each.
 *
 * @param mem    first address to touch
 * @param stride byte distance between consecutive touched addresses
 * @param h      number of addresses to touch; nothing is done when h <= 0
 *               (a do/while on --h would otherwise keep running until h
 *               wraps around)
 */
static void prefetch_ppc(void *mem, int stride, int h)
{
    register const uint8_t *p = mem;

    for ( ; h > 0; h--) {
        __asm__ volatile ("dcbt 0,%0" : : "r" (p));
        p += stride;
    }
}
00145 
00146 void dsputil_init_ppc(DSPContext* c, AVCodecContext *avctx)
00147 {
00148     const int high_bit_depth = avctx->bits_per_raw_sample > 8;
00149 
00150     // Common optimizations whether AltiVec is available or not
00151     c->prefetch = prefetch_ppc;
00152     if (!high_bit_depth) {
00153     switch (check_dcbzl_effect()) {
00154         case 32:
00155             c->clear_blocks = clear_blocks_dcbz32_ppc;
00156             break;
00157         case 128:
00158             c->clear_blocks = clear_blocks_dcbz128_ppc;
00159             break;
00160         default:
00161             break;
00162     }
00163     }
00164 
00165 #if HAVE_ALTIVEC
00166     if(CONFIG_H264_DECODER) dsputil_h264_init_ppc(c, avctx);
00167 
00168     if (av_get_cpu_flags() & AV_CPU_FLAG_ALTIVEC) {
00169         dsputil_init_altivec(c, avctx);
00170         float_init_altivec(c, avctx);
00171         int_init_altivec(c, avctx);
00172         c->gmc1 = gmc1_altivec;
00173 
00174 #if CONFIG_ENCODERS
00175         if (avctx->bits_per_raw_sample <= 8 &&
00176             (avctx->dct_algo == FF_DCT_AUTO ||
00177              avctx->dct_algo == FF_DCT_ALTIVEC)) {
00178             c->fdct = fdct_altivec;
00179         }
00180 #endif //CONFIG_ENCODERS
00181 
00182         if (avctx->lowres == 0 && avctx->bits_per_raw_sample <= 8) {
00183             if ((avctx->idct_algo == FF_IDCT_AUTO) ||
00184                 (avctx->idct_algo == FF_IDCT_ALTIVEC)) {
00185                 c->idct_put = idct_put_altivec;
00186                 c->idct_add = idct_add_altivec;
00187                 c->idct_permutation_type = FF_TRANSPOSE_IDCT_PERM;
00188             }else if((CONFIG_VP3_DECODER || CONFIG_VP5_DECODER || CONFIG_VP6_DECODER) &&
00189                      avctx->idct_algo==FF_IDCT_VP3){
00190                 c->idct_put = ff_vp3_idct_put_altivec;
00191                 c->idct_add = ff_vp3_idct_add_altivec;
00192                 c->idct     = ff_vp3_idct_altivec;
00193                 c->idct_permutation_type = FF_TRANSPOSE_IDCT_PERM;
00194             }
00195         }
00196 
00197     }
00198 #endif /* HAVE_ALTIVEC */
00199 }
Generated on Thu Jul 11 2013 15:38:21 for Libav by doxygen 1.7.1