// Demonstrate.c

/*
        File: Demonstrate.c
    Abstract: vDSP AltiVec examples.
     Version: 1.2
    
    Disclaimer: IMPORTANT:  This Apple software is supplied to you by Apple
    Inc. ("Apple") in consideration of your agreement to the following
    terms, and your use, installation, modification or redistribution of
    this Apple software constitutes acceptance of these terms.  If you do
    not agree with these terms, please do not use, install, modify or
    redistribute this Apple software.
    
    In consideration of your agreement to abide by the following terms, and
    subject to these terms, Apple grants you a personal, non-exclusive
    license, under Apple's copyrights in this original Apple software (the
    "Apple Software"), to use, reproduce, modify and redistribute the Apple
    Software, with or without modifications, in source and/or binary forms;
    provided that if you redistribute the Apple Software in its entirety and
    without modifications, you must retain this notice and the following
    text and disclaimers in all such redistributions of the Apple Software.
    Neither the name, trademarks, service marks or logos of Apple Inc. may
    be used to endorse or promote products derived from the Apple Software
    without specific prior written permission from Apple.  Except as
    expressly stated in this notice, no other rights or licenses, express or
    implied, are granted by Apple herein, including but not limited to any
    patent rights that may be infringed by your derivative works or by other
    works in which the Apple Software may be incorporated.
    
    The Apple Software is provided by Apple on an "AS IS" basis.  APPLE
    MAKES NO WARRANTIES, EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION
    THE IMPLIED WARRANTIES OF NON-INFRINGEMENT, MERCHANTABILITY AND FITNESS
    FOR A PARTICULAR PURPOSE, REGARDING THE APPLE SOFTWARE OR ITS USE AND
    OPERATION ALONE OR IN COMBINATION WITH YOUR PRODUCTS.
    
    IN NO EVENT SHALL APPLE BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL
    OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
    SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
    INTERRUPTION) ARISING IN ANY WAY OUT OF THE USE, REPRODUCTION,
    MODIFICATION AND/OR DISTRIBUTION OF THE APPLE SOFTWARE, HOWEVER CAUSED
    AND WHETHER UNDER THEORY OF CONTRACT, TORT (INCLUDING NEGLIGENCE),
    STRICT LIABILITY OR OTHERWISE, EVEN IF APPLE HAS BEEN ADVISED OF THE
    POSSIBILITY OF SUCH DAMAGE.
    
    Copyright (C) 2012 Apple Inc. All Rights Reserved.
    
 
    File: Demonstrate.c
    
    Description:
        Main routine and some subroutines for the vDSP AltiVec examples.
    
    Copyright:
        Copyright (C) 2007 Apple Inc.  All rights reserved.
*/
 
 
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
 
#include <sys/param.h>
#include <sys/sysctl.h>
 
#include <Accelerate/Accelerate.h>
 
#include "Demonstrate.h"
 
 
/*  Here we define some minor subroutines.  If this were a larger
    application, these would likely be defined in a separate file, rather
    than in Demonstrate.c.
*/
 
 
#if defined __ppc__ || defined __ppc64__
 
    /*  Some PowerPC processors have AltiVec (including those in G4 and
        G5 systems) and some do not (including those in G3 systems).
        If you want to write code that uses AltiVec features to run
        quickly on G4 or G5 systems but still works (more slowly) on G3
        systems, you may need to test for the presence of AltiVec
        features at run time.
 
        To start, we will define a Boolean value that we will set to
        indicate whether AltiVec is available or not.
    */
    static _Bool HasVector;

    /*  Ask the operating system, once, whether AltiVec features are
        present and record the answer in HasVector.  If the query
        itself fails, an error is printed and the program exits.
    */
    static void InitializeHasVector(void)
    {
        unsigned int AltiVecFlag;
        size_t FlagSize = sizeof AltiVecFlag;

        // Fetch the value of "hw.optional.altivec" via sysctlbyname.
        const int Status = sysctlbyname("hw.optional.altivec",
            &AltiVecFlag, &FlagSize, NULL, 0);

        if (Status == 0)
        {
            /*  The query succeeded.  HasVector is a _Bool, so any
                nonzero flag is stored as true; from here on, code
                can simply test HasVector.
            */
            HasVector = AltiVecFlag;
            return;
        }

        /*  sysctlbyname failed.  That is not expected on a PowerPC
            (an Intel system, of course, has no value for
            "hw.optional.altivec"), so report it and give up.
        */
        fprintf(stderr,
"Error, sysctlbyname(\"hw.optional.altivec\") failed with\n"
"errno %d:  %s.\n",
            errno, strerror(errno));
        exit(EXIT_FAILURE);
    }
 
#else   // defined __ppc__ || defined __ppc64__
 
    /*  If we are not on a PowerPC, define InitializeHasVector to do
        nothing.  The macro expands to no tokens, so a call written as
        "InitializeHasVector();" is left as an empty statement.
    */
    #define InitializeHasVector()

    /*  Since this is not being compiled for a PowerPC, we will assume
        it is being compiled for an Intel system that has vector
        features, since all Apple systems have SSE2 at least.  (If you
        are writing code for other systems, you may need additional
        tests here.)

        HasVector is therefore a compile-time constant here rather than
        an object set at run time.
    */
    #define HasVector   1
 
#endif  // defined __ppc__ || defined __ppc64__
 
 
/*  This next section defines some things to change the floating-point
    math environment.  Both AltiVec and Intel (IA-32 and EM64T) processors
    execute floating-point operations more quickly when they can treat
    subnormal numbers as zero, so we want to enable that mode.
 
    In summary:
 
        PowerPC and Intel processors can be set to handle subnormal
        floating-point numbers (essentially those with tiny exponents,
        very near zero) as specified by the IEEE 754 floating-point
        standard or to convert such numbers to zero when they are used
        or produced in floating-point arithmetic instructions.  Let us
        call the former mode "conformant" and the latter mode "fast".
 
        On some PowerPC processors, in conformant mode, all vector
        floating-point arithmetic instructions take an extra CPU cycle.
 
        On some Intel processors, in conformant mode, vector
        floating-point arithmetic instructions that encounter a
        subnormal number generate a trap that may take around a
        thousand CPU cycles to handle.
 
    On Intel processors, code compiled with Apple GCC uses vector
    instructions for all floating-point arithmetic.  Thus, all
    floating-point arithmetic is affected if a subnormal is encountered and
    is affected by the selection of conformant mode or fast mode.
 
    Because the penalty on PowerPC processors affects all applications
    with vector floating-point instructions, you will want to set fast mode
    unless your application requires proper handling of subnormal numbers.
 
    Because the penalty on Intel processors only affects applications when
    subnormal numbers are encountered, your application might not suffer
    any penalty, and you could leave the processor in conformant mode (the
    default).  However, if subnormal numbers are encountered, the
    performance penalty can be huge.  It can also be intermittent, causing
    erratic operation that is fast sometimes and slow sometimes.  To avoid
    this, you can set fast mode.
 
    The following section defines:
 
        MathEnvironment
 
            An object type that can record the floating-point
            environment.
 
        FastMathEnvironment
 
            An environment in which fast mode is set.
 
        MathEnvironment SetMathEnvironment(MathEnvironment New)
 
            A function that sets a new environment and returns the
            old one.
 
            Note that SetMathEnvironment sets the entire
            environment, not just the mode in which subnormal
            numbers are handled.  Notably, the AltiVec saturation
            bit is changed and the Intel rounding and exception
            bits are changed.
 
    You can use the symbols above to set the environment without
    additional information about the details.  For those who wish to know
    the details of the modes:
 
        On PowerPC, the subnormal handling is controlled by the
        non-Java mode bit.  Non-Java mode is the fast mode, and Java
        mode is the conformant mode.  It is unfortunate the mode is
        named in the negative, as a non-Java mode rather than a Java
        mode, but it is that way in the AltiVec specification.
 
        On Intel, the subnormal handling is controlled by two bits in
        the MXCSR, the FZ (flush-to-zero, subnormal outputs are
        replaced by zero) and DAZ (denormals-are-zeros, subnormal
        inputs are replaced by zero).
*/
#if defined __ppc__ || defined __ppc64__
 
    /*  On PowerPC, a MathEnvironment is a vUInt32.  The AltiVec
        version of SetMathEnvironment below obtains such a value from
        vec_mfvscr and passes one to vec_mtvscr.
    */
    typedef vUInt32 MathEnvironment;

    // Define an environment with the non-Java bit set (written as 1<<16).
    #define FastMathEnvironment ((vUInt32) (1<<16))
 
    /*  Two implementations of SetMathEnvironment follow.  The first is
        for when the AltiVec language extensions are available.  The
        second is a stub routine for when the extensions are
        unavailable.  (When the extensions are unavailable, it is
        still possible to use GCC assembly language features to access
        the necessary processor register.  However, such code is of
        course not generally portable to other compilers.)
 
        _AltiVecPIMLanguageExtensionsAreEnabled is a symbol newly in
        Accelerate/Accelerate.h as of Mac OS 10.5.  The intent is for
        it to be defined if and only if the C language extensions
        defined in the AltiVec Programming Interface Manual are
        available in the current compilation.  This is a tricky
        proposition because there is not universal agreement between
        compilers about how to indicate that.  The test used in
        Accelerate to determine that should work on several compilers.
    */
    #if defined _AltiVecPIMLanguageExtensionsAreEnabled
        /*  Write New to the VSCR and return the value the VSCR held
            beforehand.  When HasVector is false, nothing is written
            and (vUInt32) (0) is returned instead.
        */
        MathEnvironment SetMathEnvironment(MathEnvironment New)
        {
            /*  On a machine without AltiVec, you cannot set
                non-Java mode, so there is nothing to do.
            */
            if (!HasVector)
                return (vUInt32) (0);

            // Read the VSCR before it is overwritten.
            MathEnvironment Previous = vec_mfvscr();

            // Install the requested value in the VSCR.
            vec_mtvscr(New);

            return Previous;
        }
    #else   // defined _AltiVecPIMLanguageExtensionsAreEnabled
        /*  Stub used when the AltiVec language extensions are not
            enabled: the requested environment is ignored and
            (vUInt32) (0) is returned.
        */
        MathEnvironment SetMathEnvironment(MathEnvironment New)
        {
            (void) New;     // Intentionally unused in this stub.
            return (vUInt32) (0);
        }
    #endif  // defined _AltiVecPIMLanguageExtensionsAreEnabled
 
#elif defined __i386__ || defined __x86_64__
 
    #include <fenv.h>
    #if !defined __GNUC__
        /*  This statement should be used when the compiler
            supports it.
        */
        #pragma STDC FENV_ACCESS ON
    #endif
 
    typedef fenv_t MathEnvironment;
 
    // Define FastMathEnvironment to use one provided by Apple via fenv.h.
    #define FastMathEnvironment (*FE_DFL_DISABLE_SSE_DENORMS_ENV)
 
    MathEnvironment SetMathEnvironment(MathEnvironment New)
    {
        MathEnvironment Old;
    
        // Get the old environment.
        if (0 != fegetenv(&Old))
        {
            fprintf(stderr, "Error, fegetenv returned non-zero.\n");
            exit(EXIT_FAILURE);
        }
    
        // Set the new environment.
        if (0 != fesetenv(&New))
        {
            fprintf(stderr, "Error, fesetenv returned non-zero.\n");
            exit(EXIT_FAILURE);
        }
 
        return Old;
    }
 
#else
 
    /*  No architecture-specific support is available here, so the
        environment is only a placeholder value and cannot actually
        be changed.
    */
    typedef _Bool MathEnvironment;
    static const MathEnvironment FastMathEnvironment = 0;

    // Ignore the requested environment and always report zero.
    MathEnvironment SetMathEnvironment(MathEnvironment New)
    {
        (void) New;     // There is nothing to set on this architecture.
        return (MathEnvironment) 0;
    }
 
#endif
 
 
// Define static data for Clock routine.

/*  Ratio of mach_absolute_time ticks to nanoseconds.  Filled in by
    mach_timebase_info in InitializeClock and read by ClockToSeconds.
*/
static mach_timebase_info_data_t MachClockInfo;

/*  Average latency of clock routine, in clock ticks.  Set by
    InitializeClock and subtracted from each interval in ClockToSeconds.
*/
ClockData ClockLatency;
 
 
// Read the mach clock and return its current value, in clock ticks.
ClockData Clock(void)
{
    const ClockData Now = mach_absolute_time();

    return Now;
}
 
 
/*  Subtract two clock measurements and convert difference to seconds,
    excluding measurement time.

    t1 is the later reading and t0 the earlier one, both in the units
    returned by Clock.  The difference, less ClockLatency, is scaled by
    MachClockInfo.numer / MachClockInfo.denom (nanoseconds per tick) and
    by 1e-9 (seconds per nanosecond).  InitializeClock must have run
    first, since it sets both MachClockInfo and ClockLatency.

    The latency is subtracted in floating point rather than in ClockData.
    If ClockData is an unsigned integer type (Clock returns the result of
    mach_absolute_time; confirm against Demonstrate.h), then an interval
    shorter than ClockLatency would otherwise wrap around to an enormous
    positive tick count.  Computed this way, such an interval yields a
    small negative number of seconds instead.
*/
double ClockToSeconds(ClockData t1, ClockData t0)
{
    // Raw interval in ticks, converted before the latency is removed.
    const double Elapsed = (double) (t1 - t0);
    const double Ticks = Elapsed - (double) ClockLatency;

    return Ticks * 1e-9
        * MachClockInfo.numer / MachClockInfo.denom;
}
 
 
/*  Initialize static data for Clock routine.

    Fills in MachClockInfo with the tick-to-nanosecond ratio and sets
    ClockLatency to the average cost, in ticks, of one call to Clock.
    If the ratio cannot be obtained, an error is printed and the program
    exits, as the other initialization routines in this file do.
*/
static void InitializeClock(void)
{
    static const int Iterations = 1000000;

    int i;

    /*  Get ratio of mach_absolute_time ticks to nanoseconds.  (One
        tick is numer/denom nanoseconds.)

        mach_timebase_info returns zero (KERN_SUCCESS) on success.  A
        failure, or a zero denominator, would make every later
        ClockToSeconds result meaningless, so stop here instead.
    */
    if (0 != mach_timebase_info(&MachClockInfo)
        || 0 == MachClockInfo.denom)
    {
        fprintf(stderr,
            "Error, mach_timebase_info failed or returned a zero denominator.\n");
        exit(EXIT_FAILURE);
    }

    /*  Measure latency of Clock routine.  t1 starts equal to t0 so it
        holds a defined value even before the first loop iteration.
    */
    ClockData t0 = Clock(), t1 = t0;
    for (i = 0; i < Iterations; ++i)
        t1 = Clock();

    // Record average latency (rounded down).
    ClockLatency = (t1 - t0) / Iterations;
}
 
 
int main(void)
{
    /*  Initialize various things.  These are typically done only once,
        at the start of a program.
    */
    InitializeClock();
    InitializeHasVector();
 
    // Set the floating-point math environment for fast execution.
    MathEnvironment OldMathEnvironment
        = SetMathEnvironment(FastMathEnvironment);
 
    DemonstrateConvolution();
    DemonstrateDFT();
    DemonstrateFFT();
    DemonstrateFFT2D();
 
    /*  Restore the original math environment.  This is not necessary
        at the end of a program, but this is how you might do it in an
        application that wanted to change the environment various
        times.
    */
    SetMathEnvironment(OldMathEnvironment);
 
    return 0;
}