/*******************************************************************************
* Copyright (C) 2018 Intel Corporation
*
* This software and the related documents are Intel copyrighted  materials,  and
* your use of  them is  governed by the  express license  under which  they were
* provided to you (License).  Unless the License provides otherwise, you may not
* use, modify, copy, publish, distribute,  disclose or transmit this software or
* the related documents without Intel's prior written permission.
*
* This software and the related documents  are provided as  is,  with no express
* or implied  warranties,  other  than those  that are  expressly stated  in the
* License.
*******************************************************************************/

/*
 *
 *  Content:
 *            Touch test for all available oneMKL VM math functions:
 *
 *            HA (High Accuracy), LA (Low Accuracy), EP (Enhanced Performance)
 *            single, double, complex precision function different variants:
 *            basic, explicit mode, strided and strided with explicit mode
 *            are being called.
 *
 *******************************************************************************/

#if (defined __INTEL_LLVM_COMPILER)
#include <mathimf.h>
#else
#include <math.h>
#endif

#include <algorithm>
#include <cfloat>
#include <chrono>
#include <cmath>
#include <complex>
#include <cstdint>
#include <cstdio>
#include <iostream>
#include <numeric>
#include <random>
#include <stdexcept>
#include <string>
#include <type_traits>

#include <sycl/sycl.hpp>

#include "mkl_vml.h"
#include "oneapi/mkl/vm.hpp"
#include "oneapi/mkl/vm/device/vm.hpp"

#include "common_for_examples.hpp"

/**
 * Common constants:
 */
// Number of inputs evaluated per oneMKL VM call (vector length)
static constexpr int length = 1000;

// Upper bounds on the accepted error in ulp's; the double-based tables add
// +1.0 to their first two entries because the reference double precision
// function is itself rounded
static constexpr float s_allowed_ulp[] = {1.0f, 4.0f, 5000.0f};
static constexpr float c_allowed_ulp[] = {1.0f, 4.0f, 5000.0f};
static constexpr double d_allowed_ulp[] = {1.0 + 1.0, 4.0 + 1.0, 7.0e7};
static constexpr double z_allowed_ulp[] = {1.0 + 1.0, 4.0 + 1.0, 7.0e7};

// Value used wherever a function takes a fixed (scalar) argument
static constexpr double fixed = 3.14;

/* Non-zero turns the errors printout on */
static constexpr int print_err = 1;

/* Non-zero turns the warnings printout on */
static constexpr int print_warn = 0;

/* Longest supported function full name */
static constexpr int name_len = 64;

// Cap on the number of errors printed per function (keeps the log small)
static constexpr int max_printed = 1;

/**
 * Mapping to oneMKL VM accuracy mode constants:
 *
 * Entries are stored in the order ha, la, ep. The VM launchers in this file
 * index this table with their `acc` argument.
 */
static const oneapi::mkl::vm::mode vm_mode[] = {
    oneapi::mkl::vm::mode::ha,
    oneapi::mkl::vm::mode::la,
    oneapi::mkl::vm::mode::ep,
};

/**
 * Function accuracy levels (values are consecutive, starting at zero):
 */
enum VmAccuracy {
    kHA = 0,    // High accuracy
    kLA = 1,    // Low accuracy
    kEP = 2,    // Enhanced performance
    kAccNum = 3 // Count of the accuracy levels above
};

/**
 * Function precisions (values are consecutive, starting at zero):
 */
enum VmPrecision {
    kSP = 0, // Real, single precision
    kDP = 1, // Real, double precision
    kCP = 2, // Complex, single precision
    kZP = 3, // Complex, double precision
};

/**
 * Function argument layouts (values are consecutive, starting at zero):
 */
enum VmLayout {
    kVtoV = 0,   // one vector argument                -> one vector result
    kVVtoV = 1,  // two vector arguments               -> one vector result
    kVtoVV = 2,  // one vector argument                -> two vector results
    kVXtoV = 3,  // one vector + one fixed argument    -> one vector result
    kVVXtoV = 4, // two vector + four fixed arguments  -> one vector result
    kVCtoVR = 5, // one complex argument               -> one real result
    kVRtoVC = 6, // one real argument                  -> one complex result
};

/**
 * Input arguments and host reference results shared by the VM launchers:
 */
struct VmInputData {
    int has_real;    // Non-zero when the function has real precision variants
    int has_complex; // Non-zero when the function has complex precision variants

    // Argument arrays residing on the host
    float* sarg1;
    float* sarg2;
    double* darg1;
    double* darg2;
    std::complex<float>* carg1;
    std::complex<float>* carg2;
    std::complex<double>* zarg1;
    std::complex<double>* zarg2;

    // Argument arrays residing on the device
    float* dev_sarg1;
    float* dev_sarg2;
    double* dev_darg1;
    double* dev_darg2;
    std::complex<float>* dev_carg1;
    std::complex<float>* dev_carg2;
    std::complex<double>* dev_zarg1;
    std::complex<double>* dev_zarg2;

    // Reference result arrays residing on the host (all held in double precision)
    double* sref1;
    double* sref2;
    double* dref1;
    double* dref2;
    double* csref1;
    double* zdref1;
    std::complex<double>* cref1;
    std::complex<double>* zref1;
};

/**
 * Output results produced by the VM launchers and their evaluation:
 */
struct VmOutputData {
    // Error and warning counters for the function, per precision
    int serr;
    int swarn;
    int derr;
    int dwarn;
    int cerr;
    int cwarn;
    int zerr;
    int zwarn;

    // Largest ulp values observed, one slot per accuracy level
    double sulp[kAccNum];
    double dulp[kAccNum];
    double culp[kAccNum];
    double zulp[kAccNum];

    // Result arrays residing on the host
    float* sres1;
    float* sres2;
    double* dres1;
    double* dres2;
    float* csres1;
    double* zdres1;
    std::complex<float>* cres1;
    std::complex<double>* zres1;

    // Result arrays residing on the device
    float* dev_sres1;
    float* dev_sres2;
    double* dev_dres1;
    double* dev_dres2;
    float* dev_csres1;
    double* dev_zdres1;
    std::complex<float>* dev_cres1;
    std::complex<double>* dev_zres1;
};

/**
 * Function pointers:
 */
// Pointers to real reference functions
typedef double (*RefVtoV)(double);
typedef double (*RefVVtoV)(double, double);
typedef void (*RefVtoVV)(double, double*, double*);
typedef double (*RefVVXtoV)(double, double, double, double, double, double);
// Pointers to complex reference functions
typedef std::complex<double> (*CRefVtoV)(std::complex<double>);
typedef double (*CRefCtoR)(std::complex<double>);
typedef std::complex<double> (*CRefRtoC)(double);
typedef std::complex<double> (*CRefVVtoV)(std::complex<double>, std::complex<double>);
typedef void (*CRefVtoVV)(std::complex<double>, std::complex<double>*, std::complex<double>*);
// Pointer to VM functions launcher
typedef void (*VmFunc)(sycl::queue&, int, VmInputData*, VmOutputData*);

/**
 * @brief VM functions family launchers
 *
 * Run all variants of VM function family
 *
 * @param[in]  q           Sycl queue
 * @param[in]  acc         Accuracy
 * @param[in]  in          Input and reference result arrays
 * @param[out] out         Output arrays
 *
 */
/**
 * asin: single- and double-precision kernels, one work-item per element.
 * Reads dev_sarg1 / dev_darg1, writes dev_sres1 / dev_dres1, then waits on q.
 */
static void own_vm_asin(sycl::queue& q, int acc, VmInputData* in, VmOutputData* out) {
    // Accuracy mode selected by the caller-supplied index
    const auto mode = vm_mode[acc];

    float* s_src = in->dev_sarg1;
    float* s_dst = out->dev_sres1;
    double* d_src = in->dev_darg1;
    double* d_dst = out->dev_dres1;

    q.parallel_for<class VmSasin>(length, [=](sycl::id<1> id) {
        const size_t idx = id[0];
        oneapi::mkl::vm::device::asin(&s_src[idx], &s_dst[idx], mode);
    });
    q.parallel_for<class VmDasin>(length, [=](sycl::id<1> id) {
        const size_t idx = id[0];
        oneapi::mkl::vm::device::asin(&d_src[idx], &d_dst[idx], mode);
    });

    q.wait_and_throw();
} // own_vm_asin
  /**
   * acos: single- and double-precision kernels, one work-item per element.
   * Reads dev_sarg1 / dev_darg1, writes dev_sres1 / dev_dres1, then waits on q.
   */
static void own_vm_acos(sycl::queue& q, int acc, VmInputData* in, VmOutputData* out) {
    // Accuracy mode selected by the caller-supplied index
    const auto mode = vm_mode[acc];

    float* s_src = in->dev_sarg1;
    float* s_dst = out->dev_sres1;
    double* d_src = in->dev_darg1;
    double* d_dst = out->dev_dres1;

    q.parallel_for<class VmSacos>(length, [=](sycl::id<1> id) {
        const size_t idx = id[0];
        oneapi::mkl::vm::device::acos(&s_src[idx], &s_dst[idx], mode);
    });
    q.parallel_for<class VmDacos>(length, [=](sycl::id<1> id) {
        const size_t idx = id[0];
        oneapi::mkl::vm::device::acos(&d_src[idx], &d_dst[idx], mode);
    });

    q.wait_and_throw();
} // own_vm_acos
  /**
   * atan: single- and double-precision kernels, one work-item per element.
   * Reads dev_sarg1 / dev_darg1, writes dev_sres1 / dev_dres1, then waits on q.
   */
static void own_vm_atan(sycl::queue& q, int acc, VmInputData* in, VmOutputData* out) {
    // Accuracy mode selected by the caller-supplied index
    const auto mode = vm_mode[acc];

    float* s_src = in->dev_sarg1;
    float* s_dst = out->dev_sres1;
    double* d_src = in->dev_darg1;
    double* d_dst = out->dev_dres1;

    q.parallel_for<class VmSatan>(length, [=](sycl::id<1> id) {
        const size_t idx = id[0];
        oneapi::mkl::vm::device::atan(&s_src[idx], &s_dst[idx], mode);
    });
    q.parallel_for<class VmDatan>(length, [=](sycl::id<1> id) {
        const size_t idx = id[0];
        oneapi::mkl::vm::device::atan(&d_src[idx], &d_dst[idx], mode);
    });

    q.wait_and_throw();
} // own_vm_atan
  /**
   * asinpi: single- and double-precision kernels, one work-item per element.
   * Reads dev_sarg1 / dev_darg1, writes dev_sres1 / dev_dres1, then waits on q.
   */
static void own_vm_asinpi(sycl::queue& q, int acc, VmInputData* in, VmOutputData* out) {
    // Accuracy mode selected by the caller-supplied index
    const auto mode = vm_mode[acc];

    float* s_src = in->dev_sarg1;
    float* s_dst = out->dev_sres1;
    double* d_src = in->dev_darg1;
    double* d_dst = out->dev_dres1;

    q.parallel_for<class VmSasinpi>(length, [=](sycl::id<1> id) {
        const size_t idx = id[0];
        oneapi::mkl::vm::device::asinpi(&s_src[idx], &s_dst[idx], mode);
    });
    q.parallel_for<class VmDasinpi>(length, [=](sycl::id<1> id) {
        const size_t idx = id[0];
        oneapi::mkl::vm::device::asinpi(&d_src[idx], &d_dst[idx], mode);
    });

    q.wait_and_throw();
} // own_vm_asinpi
  /**
   * acospi: single- and double-precision kernels, one work-item per element.
   * Reads dev_sarg1 / dev_darg1, writes dev_sres1 / dev_dres1, then waits on q.
   */
static void own_vm_acospi(sycl::queue& q, int acc, VmInputData* in, VmOutputData* out) {
    // Accuracy mode selected by the caller-supplied index
    const auto mode = vm_mode[acc];

    float* s_src = in->dev_sarg1;
    float* s_dst = out->dev_sres1;
    double* d_src = in->dev_darg1;
    double* d_dst = out->dev_dres1;

    q.parallel_for<class VmSacospi>(length, [=](sycl::id<1> id) {
        const size_t idx = id[0];
        oneapi::mkl::vm::device::acospi(&s_src[idx], &s_dst[idx], mode);
    });
    q.parallel_for<class VmDacospi>(length, [=](sycl::id<1> id) {
        const size_t idx = id[0];
        oneapi::mkl::vm::device::acospi(&d_src[idx], &d_dst[idx], mode);
    });

    q.wait_and_throw();
} // own_vm_acospi
  /**
   * atanpi: single- and double-precision kernels, one work-item per element.
   * Reads dev_sarg1 / dev_darg1, writes dev_sres1 / dev_dres1, then waits on q.
   */
static void own_vm_atanpi(sycl::queue& q, int acc, VmInputData* in, VmOutputData* out) {
    // Accuracy mode selected by the caller-supplied index
    const auto mode = vm_mode[acc];

    float* s_src = in->dev_sarg1;
    float* s_dst = out->dev_sres1;
    double* d_src = in->dev_darg1;
    double* d_dst = out->dev_dres1;

    q.parallel_for<class VmSatanpi>(length, [=](sycl::id<1> id) {
        const size_t idx = id[0];
        oneapi::mkl::vm::device::atanpi(&s_src[idx], &s_dst[idx], mode);
    });
    q.parallel_for<class VmDatanpi>(length, [=](sycl::id<1> id) {
        const size_t idx = id[0];
        oneapi::mkl::vm::device::atanpi(&d_src[idx], &d_dst[idx], mode);
    });

    q.wait_and_throw();
} // own_vm_atanpi
  /**
   * atan2: single- and double-precision kernels, one work-item per element.
   * Reads dev_sarg1, dev_sarg2 / dev_darg1, dev_darg2, writes dev_sres1 /
   * dev_dres1, then waits on q.
   */
static void own_vm_atan2(sycl::queue& q, int acc, VmInputData* in, VmOutputData* out) {
    // Accuracy mode selected by the caller-supplied index
    const auto mode = vm_mode[acc];

    float* s_lhs = in->dev_sarg1;
    float* s_rhs = in->dev_sarg2;
    float* s_dst = out->dev_sres1;
    double* d_lhs = in->dev_darg1;
    double* d_rhs = in->dev_darg2;
    double* d_dst = out->dev_dres1;

    q.parallel_for<class VmSatan2>(length, [=](sycl::id<1> id) {
        const size_t idx = id[0];
        oneapi::mkl::vm::device::atan2(&s_lhs[idx], &s_rhs[idx], &s_dst[idx], mode);
    });
    q.parallel_for<class VmDatan2>(length, [=](sycl::id<1> id) {
        const size_t idx = id[0];
        oneapi::mkl::vm::device::atan2(&d_lhs[idx], &d_rhs[idx], &d_dst[idx], mode);
    });

    q.wait_and_throw();
} // own_vm_atan2
  /**
   * atan2pi: single- and double-precision kernels, one work-item per element.
   * Reads dev_sarg1, dev_sarg2 / dev_darg1, dev_darg2, writes dev_sres1 /
   * dev_dres1, then waits on q.
   */
static void own_vm_atan2pi(sycl::queue& q, int acc, VmInputData* in, VmOutputData* out) {
    // Accuracy mode selected by the caller-supplied index
    const auto mode = vm_mode[acc];

    float* s_lhs = in->dev_sarg1;
    float* s_rhs = in->dev_sarg2;
    float* s_dst = out->dev_sres1;
    double* d_lhs = in->dev_darg1;
    double* d_rhs = in->dev_darg2;
    double* d_dst = out->dev_dres1;

    q.parallel_for<class VmSatan2pi>(length, [=](sycl::id<1> id) {
        const size_t idx = id[0];
        oneapi::mkl::vm::device::atan2pi(&s_lhs[idx], &s_rhs[idx], &s_dst[idx], mode);
    });
    q.parallel_for<class VmDatan2pi>(length, [=](sycl::id<1> id) {
        const size_t idx = id[0];
        oneapi::mkl::vm::device::atan2pi(&d_lhs[idx], &d_rhs[idx], &d_dst[idx], mode);
    });

    q.wait_and_throw();
} // own_vm_atan2pi
  /**
   * asinh: single- and double-precision kernels, one work-item per element.
   * Reads dev_sarg1 / dev_darg1, writes dev_sres1 / dev_dres1, then waits on q.
   */
static void own_vm_asinh(sycl::queue& q, int acc, VmInputData* in, VmOutputData* out) {
    // Accuracy mode selected by the caller-supplied index
    const auto mode = vm_mode[acc];

    float* s_src = in->dev_sarg1;
    float* s_dst = out->dev_sres1;
    double* d_src = in->dev_darg1;
    double* d_dst = out->dev_dres1;

    q.parallel_for<class VmSasinh>(length, [=](sycl::id<1> id) {
        const size_t idx = id[0];
        oneapi::mkl::vm::device::asinh(&s_src[idx], &s_dst[idx], mode);
    });
    q.parallel_for<class VmDasinh>(length, [=](sycl::id<1> id) {
        const size_t idx = id[0];
        oneapi::mkl::vm::device::asinh(&d_src[idx], &d_dst[idx], mode);
    });

    q.wait_and_throw();
} // own_vm_asinh
  /**
   * acosh: single- and double-precision kernels, one work-item per element.
   * Reads dev_sarg1 / dev_darg1, writes dev_sres1 / dev_dres1, then waits on q.
   */
static void own_vm_acosh(sycl::queue& q, int acc, VmInputData* in, VmOutputData* out) {
    // Accuracy mode selected by the caller-supplied index
    const auto mode = vm_mode[acc];

    float* s_src = in->dev_sarg1;
    float* s_dst = out->dev_sres1;
    double* d_src = in->dev_darg1;
    double* d_dst = out->dev_dres1;

    q.parallel_for<class VmSacosh>(length, [=](sycl::id<1> id) {
        const size_t idx = id[0];
        oneapi::mkl::vm::device::acosh(&s_src[idx], &s_dst[idx], mode);
    });
    q.parallel_for<class VmDacosh>(length, [=](sycl::id<1> id) {
        const size_t idx = id[0];
        oneapi::mkl::vm::device::acosh(&d_src[idx], &d_dst[idx], mode);
    });

    q.wait_and_throw();
} // own_vm_acosh
  /**
   * atanh: single- and double-precision kernels, one work-item per element.
   * Reads dev_sarg1 / dev_darg1, writes dev_sres1 / dev_dres1, then waits on q.
   */
static void own_vm_atanh(sycl::queue& q, int acc, VmInputData* in, VmOutputData* out) {
    // Accuracy mode selected by the caller-supplied index
    const auto mode = vm_mode[acc];

    float* s_src = in->dev_sarg1;
    float* s_dst = out->dev_sres1;
    double* d_src = in->dev_darg1;
    double* d_dst = out->dev_dres1;

    q.parallel_for<class VmSatanh>(length, [=](sycl::id<1> id) {
        const size_t idx = id[0];
        oneapi::mkl::vm::device::atanh(&s_src[idx], &s_dst[idx], mode);
    });
    q.parallel_for<class VmDatanh>(length, [=](sycl::id<1> id) {
        const size_t idx = id[0];
        oneapi::mkl::vm::device::atanh(&d_src[idx], &d_dst[idx], mode);
    });

    q.wait_and_throw();
} // own_vm_atanh
  /**
   * sin: single- and double-precision kernels, one work-item per element.
   * Reads dev_sarg1 / dev_darg1, writes dev_sres1 / dev_dres1, then waits on q.
   */
static void own_vm_sin(sycl::queue& q, int acc, VmInputData* in, VmOutputData* out) {
    // Accuracy mode selected by the caller-supplied index
    const auto mode = vm_mode[acc];

    float* s_src = in->dev_sarg1;
    float* s_dst = out->dev_sres1;
    double* d_src = in->dev_darg1;
    double* d_dst = out->dev_dres1;

    q.parallel_for<class VmSsin>(length, [=](sycl::id<1> id) {
        const size_t idx = id[0];
        oneapi::mkl::vm::device::sin(&s_src[idx], &s_dst[idx], mode);
    });
    q.parallel_for<class VmDsin>(length, [=](sycl::id<1> id) {
        const size_t idx = id[0];
        oneapi::mkl::vm::device::sin(&d_src[idx], &d_dst[idx], mode);
    });

    q.wait_and_throw();
} // own_vm_sin
  /**
   * cos: single- and double-precision kernels, one work-item per element.
   * Reads dev_sarg1 / dev_darg1, writes dev_sres1 / dev_dres1, then waits on q.
   */
static void own_vm_cos(sycl::queue& q, int acc, VmInputData* in, VmOutputData* out) {
    // Accuracy mode selected by the caller-supplied index
    const auto mode = vm_mode[acc];

    float* s_src = in->dev_sarg1;
    float* s_dst = out->dev_sres1;
    double* d_src = in->dev_darg1;
    double* d_dst = out->dev_dres1;

    q.parallel_for<class VmScos>(length, [=](sycl::id<1> id) {
        const size_t idx = id[0];
        oneapi::mkl::vm::device::cos(&s_src[idx], &s_dst[idx], mode);
    });
    q.parallel_for<class VmDcos>(length, [=](sycl::id<1> id) {
        const size_t idx = id[0];
        oneapi::mkl::vm::device::cos(&d_src[idx], &d_dst[idx], mode);
    });

    q.wait_and_throw();
} // own_vm_cos
  /**
   * tan: single- and double-precision kernels, one work-item per element.
   * Reads dev_sarg1 / dev_darg1, writes dev_sres1 / dev_dres1, then waits on q.
   */
static void own_vm_tan(sycl::queue& q, int acc, VmInputData* in, VmOutputData* out) {
    // Accuracy mode selected by the caller-supplied index
    const auto mode = vm_mode[acc];

    float* s_src = in->dev_sarg1;
    float* s_dst = out->dev_sres1;
    double* d_src = in->dev_darg1;
    double* d_dst = out->dev_dres1;

    q.parallel_for<class VmStan>(length, [=](sycl::id<1> id) {
        const size_t idx = id[0];
        oneapi::mkl::vm::device::tan(&s_src[idx], &s_dst[idx], mode);
    });
    q.parallel_for<class VmDtan>(length, [=](sycl::id<1> id) {
        const size_t idx = id[0];
        oneapi::mkl::vm::device::tan(&d_src[idx], &d_dst[idx], mode);
    });

    q.wait_and_throw();
} // own_vm_tan
  /**
   * sincos: single- and double-precision kernels, one work-item per element.
   * Reads dev_sarg1 / dev_darg1, writes two results per element
   * (dev_sres1, dev_sres2 / dev_dres1, dev_dres2), then waits on q.
   */
static void own_vm_sincos(sycl::queue& q, int acc, VmInputData* in, VmOutputData* out) {
    // Accuracy mode selected by the caller-supplied index
    const auto mode = vm_mode[acc];

    float* s_src = in->dev_sarg1;
    float* s_dst1 = out->dev_sres1;
    float* s_dst2 = out->dev_sres2;
    double* d_src = in->dev_darg1;
    double* d_dst1 = out->dev_dres1;
    double* d_dst2 = out->dev_dres2;

    q.parallel_for<class VmSsincos>(length, [=](sycl::id<1> id) {
        const size_t idx = id[0];
        oneapi::mkl::vm::device::sincos(&s_src[idx], &s_dst1[idx], &s_dst2[idx], mode);
    });
    q.parallel_for<class VmDsincos>(length, [=](sycl::id<1> id) {
        const size_t idx = id[0];
        oneapi::mkl::vm::device::sincos(&d_src[idx], &d_dst1[idx], &d_dst2[idx], mode);
    });

    q.wait_and_throw();
} // own_vm_sincos
  /**
   * sind: single- and double-precision kernels, one work-item per element.
   * Reads dev_sarg1 / dev_darg1, writes dev_sres1 / dev_dres1, then waits on q.
   */
static void own_vm_sind(sycl::queue& q, int acc, VmInputData* in, VmOutputData* out) {
    // Accuracy mode selected by the caller-supplied index
    const auto mode = vm_mode[acc];

    float* s_src = in->dev_sarg1;
    float* s_dst = out->dev_sres1;
    double* d_src = in->dev_darg1;
    double* d_dst = out->dev_dres1;

    q.parallel_for<class VmSsind>(length, [=](sycl::id<1> id) {
        const size_t idx = id[0];
        oneapi::mkl::vm::device::sind(&s_src[idx], &s_dst[idx], mode);
    });
    q.parallel_for<class VmDsind>(length, [=](sycl::id<1> id) {
        const size_t idx = id[0];
        oneapi::mkl::vm::device::sind(&d_src[idx], &d_dst[idx], mode);
    });

    q.wait_and_throw();
} // own_vm_sind
  /**
   * cosd: single- and double-precision kernels, one work-item per element.
   * Reads dev_sarg1 / dev_darg1, writes dev_sres1 / dev_dres1, then waits on q.
   */
static void own_vm_cosd(sycl::queue& q, int acc, VmInputData* in, VmOutputData* out) {
    // Accuracy mode selected by the caller-supplied index
    const auto mode = vm_mode[acc];

    float* s_src = in->dev_sarg1;
    float* s_dst = out->dev_sres1;
    double* d_src = in->dev_darg1;
    double* d_dst = out->dev_dres1;

    q.parallel_for<class VmScosd>(length, [=](sycl::id<1> id) {
        const size_t idx = id[0];
        oneapi::mkl::vm::device::cosd(&s_src[idx], &s_dst[idx], mode);
    });
    q.parallel_for<class VmDcosd>(length, [=](sycl::id<1> id) {
        const size_t idx = id[0];
        oneapi::mkl::vm::device::cosd(&d_src[idx], &d_dst[idx], mode);
    });

    q.wait_and_throw();
} // own_vm_cosd
  /**
   * tand: single- and double-precision kernels, one work-item per element.
   * Reads dev_sarg1 / dev_darg1, writes dev_sres1 / dev_dres1, then waits on q.
   */
static void own_vm_tand(sycl::queue& q, int acc, VmInputData* in, VmOutputData* out) {
    // Accuracy mode selected by the caller-supplied index
    const auto mode = vm_mode[acc];

    float* s_src = in->dev_sarg1;
    float* s_dst = out->dev_sres1;
    double* d_src = in->dev_darg1;
    double* d_dst = out->dev_dres1;

    q.parallel_for<class VmStand>(length, [=](sycl::id<1> id) {
        const size_t idx = id[0];
        oneapi::mkl::vm::device::tand(&s_src[idx], &s_dst[idx], mode);
    });
    q.parallel_for<class VmDtand>(length, [=](sycl::id<1> id) {
        const size_t idx = id[0];
        oneapi::mkl::vm::device::tand(&d_src[idx], &d_dst[idx], mode);
    });

    q.wait_and_throw();
} // own_vm_tand
  /**
   * sinpi: single- and double-precision kernels, one work-item per element.
   * Reads dev_sarg1 / dev_darg1, writes dev_sres1 / dev_dres1, then waits on q.
   */
static void own_vm_sinpi(sycl::queue& q, int acc, VmInputData* in, VmOutputData* out) {
    // Accuracy mode selected by the caller-supplied index
    const auto mode = vm_mode[acc];

    float* s_src = in->dev_sarg1;
    float* s_dst = out->dev_sres1;
    double* d_src = in->dev_darg1;
    double* d_dst = out->dev_dres1;

    q.parallel_for<class VmSsinpi>(length, [=](sycl::id<1> id) {
        const size_t idx = id[0];
        oneapi::mkl::vm::device::sinpi(&s_src[idx], &s_dst[idx], mode);
    });
    q.parallel_for<class VmDsinpi>(length, [=](sycl::id<1> id) {
        const size_t idx = id[0];
        oneapi::mkl::vm::device::sinpi(&d_src[idx], &d_dst[idx], mode);
    });

    q.wait_and_throw();
} // own_vm_sinpi
  /**
   * cospi: single- and double-precision kernels, one work-item per element.
   * Reads dev_sarg1 / dev_darg1, writes dev_sres1 / dev_dres1, then waits on q.
   */
static void own_vm_cospi(sycl::queue& q, int acc, VmInputData* in, VmOutputData* out) {
    // Accuracy mode selected by the caller-supplied index
    const auto mode = vm_mode[acc];

    float* s_src = in->dev_sarg1;
    float* s_dst = out->dev_sres1;
    double* d_src = in->dev_darg1;
    double* d_dst = out->dev_dres1;

    q.parallel_for<class VmScospi>(length, [=](sycl::id<1> id) {
        const size_t idx = id[0];
        oneapi::mkl::vm::device::cospi(&s_src[idx], &s_dst[idx], mode);
    });
    q.parallel_for<class VmDcospi>(length, [=](sycl::id<1> id) {
        const size_t idx = id[0];
        oneapi::mkl::vm::device::cospi(&d_src[idx], &d_dst[idx], mode);
    });

    q.wait_and_throw();
} // own_vm_cospi
  /**
   * tanpi: single- and double-precision kernels, one work-item per element.
   * Reads dev_sarg1 / dev_darg1, writes dev_sres1 / dev_dres1, then waits on q.
   */
static void own_vm_tanpi(sycl::queue& q, int acc, VmInputData* in, VmOutputData* out) {
    // Accuracy mode selected by the caller-supplied index
    const auto mode = vm_mode[acc];

    float* s_src = in->dev_sarg1;
    float* s_dst = out->dev_sres1;
    double* d_src = in->dev_darg1;
    double* d_dst = out->dev_dres1;

    q.parallel_for<class VmStanpi>(length, [=](sycl::id<1> id) {
        const size_t idx = id[0];
        oneapi::mkl::vm::device::tanpi(&s_src[idx], &s_dst[idx], mode);
    });
    q.parallel_for<class VmDtanpi>(length, [=](sycl::id<1> id) {
        const size_t idx = id[0];
        oneapi::mkl::vm::device::tanpi(&d_src[idx], &d_dst[idx], mode);
    });

    q.wait_and_throw();
} // own_vm_tanpi
  /**
   * sinh: single- and double-precision kernels, one work-item per element.
   * Reads dev_sarg1 / dev_darg1, writes dev_sres1 / dev_dres1, then waits on q.
   */
static void own_vm_sinh(sycl::queue& q, int acc, VmInputData* in, VmOutputData* out) {
    // Accuracy mode selected by the caller-supplied index
    const auto mode = vm_mode[acc];

    float* s_src = in->dev_sarg1;
    float* s_dst = out->dev_sres1;
    double* d_src = in->dev_darg1;
    double* d_dst = out->dev_dres1;

    q.parallel_for<class VmSsinh>(length, [=](sycl::id<1> id) {
        const size_t idx = id[0];
        oneapi::mkl::vm::device::sinh(&s_src[idx], &s_dst[idx], mode);
    });
    q.parallel_for<class VmDsinh>(length, [=](sycl::id<1> id) {
        const size_t idx = id[0];
        oneapi::mkl::vm::device::sinh(&d_src[idx], &d_dst[idx], mode);
    });

    q.wait_and_throw();
} // own_vm_sinh
  /**
   * cosh: single- and double-precision kernels, one work-item per element.
   * Reads dev_sarg1 / dev_darg1, writes dev_sres1 / dev_dres1, then waits on q.
   */
static void own_vm_cosh(sycl::queue& q, int acc, VmInputData* in, VmOutputData* out) {
    // Accuracy mode selected by the caller-supplied index
    const auto mode = vm_mode[acc];

    float* s_src = in->dev_sarg1;
    float* s_dst = out->dev_sres1;
    double* d_src = in->dev_darg1;
    double* d_dst = out->dev_dres1;

    q.parallel_for<class VmScosh>(length, [=](sycl::id<1> id) {
        const size_t idx = id[0];
        oneapi::mkl::vm::device::cosh(&s_src[idx], &s_dst[idx], mode);
    });
    q.parallel_for<class VmDcosh>(length, [=](sycl::id<1> id) {
        const size_t idx = id[0];
        oneapi::mkl::vm::device::cosh(&d_src[idx], &d_dst[idx], mode);
    });

    q.wait_and_throw();
} // own_vm_cosh
  /**
   * tanh: single- and double-precision kernels, one work-item per element.
   * Reads dev_sarg1 / dev_darg1, writes dev_sres1 / dev_dres1, then waits on q.
   */
static void own_vm_tanh(sycl::queue& q, int acc, VmInputData* in, VmOutputData* out) {
    // Accuracy mode selected by the caller-supplied index
    const auto mode = vm_mode[acc];

    float* s_src = in->dev_sarg1;
    float* s_dst = out->dev_sres1;
    double* d_src = in->dev_darg1;
    double* d_dst = out->dev_dres1;

    q.parallel_for<class VmStanh>(length, [=](sycl::id<1> id) {
        const size_t idx = id[0];
        oneapi::mkl::vm::device::tanh(&s_src[idx], &s_dst[idx], mode);
    });
    q.parallel_for<class VmDtanh>(length, [=](sycl::id<1> id) {
        const size_t idx = id[0];
        oneapi::mkl::vm::device::tanh(&d_src[idx], &d_dst[idx], mode);
    });

    q.wait_and_throw();
} // own_vm_tanh
  /**
   * exp: real (single, double) and complex (single, double) kernels, one
   * work-item per element. Reads dev_sarg1 / dev_darg1 / dev_carg1 / dev_zarg1,
   * writes dev_sres1 / dev_dres1 / dev_cres1 / dev_zres1, then waits on q.
   */
static void own_vm_exp(sycl::queue& q, int acc, VmInputData* in, VmOutputData* out) {
    // Accuracy mode selected by the caller-supplied index
    const auto mode = vm_mode[acc];

    float* s_src = in->dev_sarg1;
    float* s_dst = out->dev_sres1;
    double* d_src = in->dev_darg1;
    double* d_dst = out->dev_dres1;
    std::complex<float>* c_src = in->dev_carg1;
    std::complex<float>* c_dst = out->dev_cres1;
    std::complex<double>* z_src = in->dev_zarg1;
    std::complex<double>* z_dst = out->dev_zres1;

    q.parallel_for<class VmSexp>(length, [=](sycl::id<1> id) {
        const size_t idx = id[0];
        oneapi::mkl::vm::device::exp(&s_src[idx], &s_dst[idx], mode);
    });
    q.parallel_for<class VmDexp>(length, [=](sycl::id<1> id) {
        const size_t idx = id[0];
        oneapi::mkl::vm::device::exp(&d_src[idx], &d_dst[idx], mode);
    });
    q.parallel_for<class VmCexp>(length, [=](sycl::id<1> id) {
        const size_t idx = id[0];
        oneapi::mkl::vm::device::exp(&c_src[idx], &c_dst[idx], mode);
    });
    q.parallel_for<class VmZexp>(length, [=](sycl::id<1> id) {
        const size_t idx = id[0];
        oneapi::mkl::vm::device::exp(&z_src[idx], &z_dst[idx], mode);
    });

    q.wait_and_throw();
} // own_vm_exp
  /**
   * exp2: single- and double-precision kernels, one work-item per element.
   * Reads dev_sarg1 / dev_darg1, writes dev_sres1 / dev_dres1, then waits on q.
   */
static void own_vm_exp2(sycl::queue& q, int acc, VmInputData* in, VmOutputData* out) {
    // Accuracy mode selected by the caller-supplied index
    const auto mode = vm_mode[acc];

    float* s_src = in->dev_sarg1;
    float* s_dst = out->dev_sres1;
    double* d_src = in->dev_darg1;
    double* d_dst = out->dev_dres1;

    q.parallel_for<class VmSexp2>(length, [=](sycl::id<1> id) {
        const size_t idx = id[0];
        oneapi::mkl::vm::device::exp2(&s_src[idx], &s_dst[idx], mode);
    });
    q.parallel_for<class VmDexp2>(length, [=](sycl::id<1> id) {
        const size_t idx = id[0];
        oneapi::mkl::vm::device::exp2(&d_src[idx], &d_dst[idx], mode);
    });

    q.wait_and_throw();
} // own_vm_exp2
  /**
   * exp10: single- and double-precision kernels, one work-item per element.
   * Reads dev_sarg1 / dev_darg1, writes dev_sres1 / dev_dres1, then waits on q.
   */
static void own_vm_exp10(sycl::queue& q, int acc, VmInputData* in, VmOutputData* out) {
    // Accuracy mode selected by the caller-supplied index
    const auto mode = vm_mode[acc];

    float* s_src = in->dev_sarg1;
    float* s_dst = out->dev_sres1;
    double* d_src = in->dev_darg1;
    double* d_dst = out->dev_dres1;

    q.parallel_for<class VmSexp10>(length, [=](sycl::id<1> id) {
        const size_t idx = id[0];
        oneapi::mkl::vm::device::exp10(&s_src[idx], &s_dst[idx], mode);
    });
    q.parallel_for<class VmDexp10>(length, [=](sycl::id<1> id) {
        const size_t idx = id[0];
        oneapi::mkl::vm::device::exp10(&d_src[idx], &d_dst[idx], mode);
    });

    q.wait_and_throw();
} // own_vm_exp10
  /**
   * expm1: single- and double-precision kernels, one work-item per element.
   * Reads dev_sarg1 / dev_darg1, writes dev_sres1 / dev_dres1, then waits on q.
   */
static void own_vm_expm1(sycl::queue& q, int acc, VmInputData* in, VmOutputData* out) {
    // Accuracy mode selected by the caller-supplied index
    const auto mode = vm_mode[acc];

    float* s_src = in->dev_sarg1;
    float* s_dst = out->dev_sres1;
    double* d_src = in->dev_darg1;
    double* d_dst = out->dev_dres1;

    q.parallel_for<class VmSexpm1>(length, [=](sycl::id<1> id) {
        const size_t idx = id[0];
        oneapi::mkl::vm::device::expm1(&s_src[idx], &s_dst[idx], mode);
    });
    q.parallel_for<class VmDexpm1>(length, [=](sycl::id<1> id) {
        const size_t idx = id[0];
        oneapi::mkl::vm::device::expm1(&d_src[idx], &d_dst[idx], mode);
    });

    q.wait_and_throw();
} // own_vm_expm1
  /**
   * ln: real (single, double) and complex (single, double) kernels, one
   * work-item per element. Reads dev_sarg1 / dev_darg1 / dev_carg1 / dev_zarg1,
   * writes dev_sres1 / dev_dres1 / dev_cres1 / dev_zres1, then waits on q.
   */
static void own_vm_ln(sycl::queue& q, int acc, VmInputData* in, VmOutputData* out) {
    // Accuracy mode selected by the caller-supplied index
    const auto mode = vm_mode[acc];

    float* s_src = in->dev_sarg1;
    float* s_dst = out->dev_sres1;
    double* d_src = in->dev_darg1;
    double* d_dst = out->dev_dres1;
    std::complex<float>* c_src = in->dev_carg1;
    std::complex<float>* c_dst = out->dev_cres1;
    std::complex<double>* z_src = in->dev_zarg1;
    std::complex<double>* z_dst = out->dev_zres1;

    q.parallel_for<class VmSln>(length, [=](sycl::id<1> id) {
        const size_t idx = id[0];
        oneapi::mkl::vm::device::ln(&s_src[idx], &s_dst[idx], mode);
    });
    q.parallel_for<class VmDln>(length, [=](sycl::id<1> id) {
        const size_t idx = id[0];
        oneapi::mkl::vm::device::ln(&d_src[idx], &d_dst[idx], mode);
    });
    q.parallel_for<class VmCln>(length, [=](sycl::id<1> id) {
        const size_t idx = id[0];
        oneapi::mkl::vm::device::ln(&c_src[idx], &c_dst[idx], mode);
    });
    q.parallel_for<class VmZln>(length, [=](sycl::id<1> id) {
        const size_t idx = id[0];
        oneapi::mkl::vm::device::ln(&z_src[idx], &z_dst[idx], mode);
    });

    q.wait_and_throw();
} // own_vm_ln
  /**
   * log10: single- and double-precision kernels, one work-item per element.
   * Reads dev_sarg1 / dev_darg1, writes dev_sres1 / dev_dres1, then waits on q.
   */
static void own_vm_log10(sycl::queue& q, int acc, VmInputData* in, VmOutputData* out) {
    // Accuracy mode selected by the caller-supplied index
    const auto mode = vm_mode[acc];

    float* s_src = in->dev_sarg1;
    float* s_dst = out->dev_sres1;
    double* d_src = in->dev_darg1;
    double* d_dst = out->dev_dres1;

    q.parallel_for<class VmSlog10>(length, [=](sycl::id<1> id) {
        const size_t idx = id[0];
        oneapi::mkl::vm::device::log10(&s_src[idx], &s_dst[idx], mode);
    });
    q.parallel_for<class VmDlog10>(length, [=](sycl::id<1> id) {
        const size_t idx = id[0];
        oneapi::mkl::vm::device::log10(&d_src[idx], &d_dst[idx], mode);
    });

    q.wait_and_throw();
} // own_vm_log10
  /**
   * log2: single- and double-precision kernels, one work-item per element.
   * Reads dev_sarg1 / dev_darg1, writes dev_sres1 / dev_dres1, then waits on q.
   */
static void own_vm_log2(sycl::queue& q, int acc, VmInputData* in, VmOutputData* out) {
    // Accuracy mode selected by the caller-supplied index
    const auto mode = vm_mode[acc];

    float* s_src = in->dev_sarg1;
    float* s_dst = out->dev_sres1;
    double* d_src = in->dev_darg1;
    double* d_dst = out->dev_dres1;

    q.parallel_for<class VmSlog2>(length, [=](sycl::id<1> id) {
        const size_t idx = id[0];
        oneapi::mkl::vm::device::log2(&s_src[idx], &s_dst[idx], mode);
    });
    q.parallel_for<class VmDlog2>(length, [=](sycl::id<1> id) {
        const size_t idx = id[0];
        oneapi::mkl::vm::device::log2(&d_src[idx], &d_dst[idx], mode);
    });

    q.wait_and_throw();
} // own_vm_log2
  /**
   * log1p: single- and double-precision kernels, one work-item per element.
   * Reads dev_sarg1 / dev_darg1, writes dev_sres1 / dev_dres1, then waits on q.
   */
static void own_vm_log1p(sycl::queue& q, int acc, VmInputData* in, VmOutputData* out) {
    // Accuracy mode selected by the caller-supplied index
    const auto mode = vm_mode[acc];

    float* s_src = in->dev_sarg1;
    float* s_dst = out->dev_sres1;
    double* d_src = in->dev_darg1;
    double* d_dst = out->dev_dres1;

    q.parallel_for<class VmSlog1p>(length, [=](sycl::id<1> id) {
        const size_t idx = id[0];
        oneapi::mkl::vm::device::log1p(&s_src[idx], &s_dst[idx], mode);
    });
    q.parallel_for<class VmDlog1p>(length, [=](sycl::id<1> id) {
        const size_t idx = id[0];
        oneapi::mkl::vm::device::log1p(&d_src[idx], &d_dst[idx], mode);
    });

    q.wait_and_throw();
} // own_vm_log1p
  /**
   * pow: single- and double-precision kernels, one work-item per element.
   * Reads dev_sarg1, dev_sarg2 / dev_darg1, dev_darg2, writes dev_sres1 /
   * dev_dres1, then waits on q.
   */
static void own_vm_pow(sycl::queue& q, int acc, VmInputData* in, VmOutputData* out) {
    // Accuracy mode selected by the caller-supplied index
    const auto mode = vm_mode[acc];

    float* s_lhs = in->dev_sarg1;
    float* s_rhs = in->dev_sarg2;
    float* s_dst = out->dev_sres1;
    double* d_lhs = in->dev_darg1;
    double* d_rhs = in->dev_darg2;
    double* d_dst = out->dev_dres1;

    q.parallel_for<class VmSpow>(length, [=](sycl::id<1> id) {
        const size_t idx = id[0];
        oneapi::mkl::vm::device::pow(&s_lhs[idx], &s_rhs[idx], &s_dst[idx], mode);
    });
    q.parallel_for<class VmDpow>(length, [=](sycl::id<1> id) {
        const size_t idx = id[0];
        oneapi::mkl::vm::device::pow(&d_lhs[idx], &d_rhs[idx], &d_dst[idx], mode);
    });

    q.wait_and_throw();
} // own_vm_pow
  /**
   * powr: single- and double-precision kernels, one work-item per element.
   * Reads dev_sarg1, dev_sarg2 / dev_darg1, dev_darg2, writes dev_sres1 /
   * dev_dres1, then waits on q.
   */
static void own_vm_powr(sycl::queue& q, int acc, VmInputData* in, VmOutputData* out) {
    // Accuracy mode selected by the caller-supplied index
    const auto mode = vm_mode[acc];

    float* s_lhs = in->dev_sarg1;
    float* s_rhs = in->dev_sarg2;
    float* s_dst = out->dev_sres1;
    double* d_lhs = in->dev_darg1;
    double* d_rhs = in->dev_darg2;
    double* d_dst = out->dev_dres1;

    q.parallel_for<class VmSpowr>(length, [=](sycl::id<1> id) {
        const size_t idx = id[0];
        oneapi::mkl::vm::device::powr(&s_lhs[idx], &s_rhs[idx], &s_dst[idx], mode);
    });
    q.parallel_for<class VmDpowr>(length, [=](sycl::id<1> id) {
        const size_t idx = id[0];
        oneapi::mkl::vm::device::powr(&d_lhs[idx], &d_rhs[idx], &d_dst[idx], mode);
    });

    q.wait_and_throw();
} // own_vm_powr
  /**
   * powx: single- and double-precision kernels, one work-item per element.
   * Reads dev_sarg1 / dev_darg1 as the vector argument and passes the file-level
   * `fixed` value as the scalar second argument; writes dev_sres1 / dev_dres1,
   * then waits on q.
   */
static void own_vm_powx(sycl::queue& q, int acc, VmInputData* in, VmOutputData* out) {
    // Accuracy mode selected by the caller-supplied index
    const auto mode = vm_mode[acc];

    float* s_src = in->dev_sarg1;
    float* s_dst = out->dev_sres1;
    double* d_src = in->dev_darg1;
    double* d_dst = out->dev_dres1;

    // Scalar second argument, converted once per precision
    const float s_scalar = static_cast<float>(fixed);
    const double d_scalar = static_cast<double>(fixed);

    q.parallel_for<class VmSpowx>(length, [=](sycl::id<1> id) {
        const size_t idx = id[0];
        oneapi::mkl::vm::device::powx(&s_src[idx], s_scalar, &s_dst[idx], mode);
    });
    q.parallel_for<class VmDpowx>(length, [=](sycl::id<1> id) {
        const size_t idx = id[0];
        oneapi::mkl::vm::device::powx(&d_src[idx], d_scalar, &d_dst[idx], mode);
    });

    q.wait_and_throw();
} // own_vm_powx
  /**
   * pow2o3
   *
   * Enqueues a float kernel and a double kernel of `length` work-items each.
   * Work-item k passes the addresses of element k of dev_sarg1 (or dev_darg1)
   * and of dev_sres1 (or dev_dres1) to oneapi::mkl::vm::device::pow2o3
   * together with vm_mode[acc]. Returns after q.wait_and_throw().
   */
static void own_vm_pow2o3(sycl::queue& q, int acc, VmInputData* in, VmOutputData* out) {
    float* src_f = in->dev_sarg1;
    float* dst_f = out->dev_sres1;
    double* src_d = in->dev_darg1;
    double* dst_d = out->dev_dres1;

    // float elements
    q.parallel_for<class VmSpow2o3>(length, [=](sycl::id<1> idx) {
        const size_t k = idx[0];
        oneapi::mkl::vm::device::pow2o3(src_f + k, dst_f + k, vm_mode[acc]);
    });
    // double elements
    q.parallel_for<class VmDpow2o3>(length, [=](sycl::id<1> idx) {
        const size_t k = idx[0];
        oneapi::mkl::vm::device::pow2o3(src_d + k, dst_d + k, vm_mode[acc]);
    });

    // Wait for the kernels submitted above to finish.
    q.wait_and_throw();
} // own_vm_pow2o3
  /**
   * pow3o2
   *
   * Enqueues a float kernel and a double kernel of `length` work-items each.
   * Work-item k passes the addresses of element k of dev_sarg1 (or dev_darg1)
   * and of dev_sres1 (or dev_dres1) to oneapi::mkl::vm::device::pow3o2
   * together with vm_mode[acc]. Returns after q.wait_and_throw().
   */
static void own_vm_pow3o2(sycl::queue& q, int acc, VmInputData* in, VmOutputData* out) {
    float* src_f = in->dev_sarg1;
    float* dst_f = out->dev_sres1;
    double* src_d = in->dev_darg1;
    double* dst_d = out->dev_dres1;

    // float elements
    q.parallel_for<class VmSpow3o2>(length, [=](sycl::id<1> idx) {
        const size_t k = idx[0];
        oneapi::mkl::vm::device::pow3o2(src_f + k, dst_f + k, vm_mode[acc]);
    });
    // double elements
    q.parallel_for<class VmDpow3o2>(length, [=](sycl::id<1> idx) {
        const size_t k = idx[0];
        oneapi::mkl::vm::device::pow3o2(src_d + k, dst_d + k, vm_mode[acc]);
    });

    // Wait for the kernels submitted above to finish.
    q.wait_and_throw();
} // own_vm_pow3o2
  /**
   * sqrt
   *
   * Enqueues four kernels of `length` work-items each: float, double,
   * std::complex<float> and std::complex<double>. Work-item k passes the
   * addresses of element k of the matching dev_*arg1 input and dev_*res1
   * output array to oneapi::mkl::vm::device::sqrt together with vm_mode[acc].
   * Returns after q.wait_and_throw().
   */
static void own_vm_sqrt(sycl::queue& q, int acc, VmInputData* in, VmOutputData* out) {
    float* src_f = in->dev_sarg1;
    float* dst_f = out->dev_sres1;
    double* src_d = in->dev_darg1;
    double* dst_d = out->dev_dres1;
    std::complex<float>* src_c = in->dev_carg1;
    std::complex<float>* dst_c = out->dev_cres1;
    std::complex<double>* src_z = in->dev_zarg1;
    std::complex<double>* dst_z = out->dev_zres1;

    // float elements
    q.parallel_for<class VmSsqrt>(length, [=](sycl::id<1> idx) {
        const size_t k = idx[0];
        oneapi::mkl::vm::device::sqrt(src_f + k, dst_f + k, vm_mode[acc]);
    });
    // double elements
    q.parallel_for<class VmDsqrt>(length, [=](sycl::id<1> idx) {
        const size_t k = idx[0];
        oneapi::mkl::vm::device::sqrt(src_d + k, dst_d + k, vm_mode[acc]);
    });
    // std::complex<float> elements
    q.parallel_for<class VmCsqrt>(length, [=](sycl::id<1> idx) {
        const size_t k = idx[0];
        oneapi::mkl::vm::device::sqrt(src_c + k, dst_c + k, vm_mode[acc]);
    });
    // std::complex<double> elements
    q.parallel_for<class VmZsqrt>(length, [=](sycl::id<1> idx) {
        const size_t k = idx[0];
        oneapi::mkl::vm::device::sqrt(src_z + k, dst_z + k, vm_mode[acc]);
    });

    // Wait for the kernels submitted above to finish.
    q.wait_and_throw();
} // own_vm_sqrt
  /**
   * invsqrt
   *
   * Enqueues a float kernel and a double kernel of `length` work-items each.
   * Work-item k passes the addresses of element k of dev_sarg1 (or dev_darg1)
   * and of dev_sres1 (or dev_dres1) to oneapi::mkl::vm::device::invsqrt
   * together with vm_mode[acc]. Returns after q.wait_and_throw().
   */
static void own_vm_invsqrt(sycl::queue& q, int acc, VmInputData* in, VmOutputData* out) {
    float* src_f = in->dev_sarg1;
    float* dst_f = out->dev_sres1;
    double* src_d = in->dev_darg1;
    double* dst_d = out->dev_dres1;

    // float elements
    q.parallel_for<class VmSinvsqrt>(length, [=](sycl::id<1> idx) {
        const size_t k = idx[0];
        oneapi::mkl::vm::device::invsqrt(src_f + k, dst_f + k, vm_mode[acc]);
    });
    // double elements
    q.parallel_for<class VmDinvsqrt>(length, [=](sycl::id<1> idx) {
        const size_t k = idx[0];
        oneapi::mkl::vm::device::invsqrt(src_d + k, dst_d + k, vm_mode[acc]);
    });

    // Wait for the kernels submitted above to finish.
    q.wait_and_throw();
} // own_vm_invsqrt
  /**
   * cbrt
   *
   * Enqueues a float kernel and a double kernel of `length` work-items each.
   * Work-item k passes the addresses of element k of dev_sarg1 (or dev_darg1)
   * and of dev_sres1 (or dev_dres1) to oneapi::mkl::vm::device::cbrt
   * together with vm_mode[acc]. Returns after q.wait_and_throw().
   */
static void own_vm_cbrt(sycl::queue& q, int acc, VmInputData* in, VmOutputData* out) {
    float* src_f = in->dev_sarg1;
    float* dst_f = out->dev_sres1;
    double* src_d = in->dev_darg1;
    double* dst_d = out->dev_dres1;

    // float elements
    q.parallel_for<class VmScbrt>(length, [=](sycl::id<1> idx) {
        const size_t k = idx[0];
        oneapi::mkl::vm::device::cbrt(src_f + k, dst_f + k, vm_mode[acc]);
    });
    // double elements
    q.parallel_for<class VmDcbrt>(length, [=](sycl::id<1> idx) {
        const size_t k = idx[0];
        oneapi::mkl::vm::device::cbrt(src_d + k, dst_d + k, vm_mode[acc]);
    });

    // Wait for the kernels submitted above to finish.
    q.wait_and_throw();
} // own_vm_cbrt
  /**
   * invcbrt
   *
   * Enqueues a float kernel and a double kernel of `length` work-items each.
   * Work-item k passes the addresses of element k of dev_sarg1 (or dev_darg1)
   * and of dev_sres1 (or dev_dres1) to oneapi::mkl::vm::device::invcbrt
   * together with vm_mode[acc]. Returns after q.wait_and_throw().
   */
static void own_vm_invcbrt(sycl::queue& q, int acc, VmInputData* in, VmOutputData* out) {
    float* src_f = in->dev_sarg1;
    float* dst_f = out->dev_sres1;
    double* src_d = in->dev_darg1;
    double* dst_d = out->dev_dres1;

    // float elements
    q.parallel_for<class VmSinvcbrt>(length, [=](sycl::id<1> idx) {
        const size_t k = idx[0];
        oneapi::mkl::vm::device::invcbrt(src_f + k, dst_f + k, vm_mode[acc]);
    });
    // double elements
    q.parallel_for<class VmDinvcbrt>(length, [=](sycl::id<1> idx) {
        const size_t k = idx[0];
        oneapi::mkl::vm::device::invcbrt(src_d + k, dst_d + k, vm_mode[acc]);
    });

    // Wait for the kernels submitted above to finish.
    q.wait_and_throw();
} // own_vm_invcbrt
  /**
   * hypot
   *
   * Enqueues a float kernel and a double kernel of `length` work-items each.
   * Work-item k passes the addresses of element k of dev_sarg1/dev_sarg2 (or
   * dev_darg1/dev_darg2) and of dev_sres1 (or dev_dres1) to
   * oneapi::mkl::vm::device::hypot together with vm_mode[acc].
   * Returns after q.wait_and_throw().
   */
static void own_vm_hypot(sycl::queue& q, int acc, VmInputData* in, VmOutputData* out) {
    float* lhs_f = in->dev_sarg1;
    float* rhs_f = in->dev_sarg2;
    float* dst_f = out->dev_sres1;
    double* lhs_d = in->dev_darg1;
    double* rhs_d = in->dev_darg2;
    double* dst_d = out->dev_dres1;

    // float elements
    q.parallel_for<class VmShypot>(length, [=](sycl::id<1> idx) {
        const size_t k = idx[0];
        oneapi::mkl::vm::device::hypot(lhs_f + k, rhs_f + k, dst_f + k, vm_mode[acc]);
    });
    // double elements
    q.parallel_for<class VmDhypot>(length, [=](sycl::id<1> idx) {
        const size_t k = idx[0];
        oneapi::mkl::vm::device::hypot(lhs_d + k, rhs_d + k, dst_d + k, vm_mode[acc]);
    });

    // Wait for the kernels submitted above to finish.
    q.wait_and_throw();
} // own_vm_hypot
  /**
   * erf
   *
   * Enqueues a float kernel and a double kernel of `length` work-items each.
   * Work-item k passes the addresses of element k of dev_sarg1 (or dev_darg1)
   * and of dev_sres1 (or dev_dres1) to oneapi::mkl::vm::device::erf
   * together with vm_mode[acc]. Returns after q.wait_and_throw().
   */
static void own_vm_erf(sycl::queue& q, int acc, VmInputData* in, VmOutputData* out) {
    float* src_f = in->dev_sarg1;
    float* dst_f = out->dev_sres1;
    double* src_d = in->dev_darg1;
    double* dst_d = out->dev_dres1;

    // float elements
    q.parallel_for<class VmSerf>(length, [=](sycl::id<1> idx) {
        const size_t k = idx[0];
        oneapi::mkl::vm::device::erf(src_f + k, dst_f + k, vm_mode[acc]);
    });
    // double elements
    q.parallel_for<class VmDerf>(length, [=](sycl::id<1> idx) {
        const size_t k = idx[0];
        oneapi::mkl::vm::device::erf(src_d + k, dst_d + k, vm_mode[acc]);
    });

    // Wait for the kernels submitted above to finish.
    q.wait_and_throw();
} // own_vm_erf
  /**
   * erfc
   *
   * Enqueues a float kernel and a double kernel of `length` work-items each.
   * Work-item k passes the addresses of element k of dev_sarg1 (or dev_darg1)
   * and of dev_sres1 (or dev_dres1) to oneapi::mkl::vm::device::erfc
   * together with vm_mode[acc]. Returns after q.wait_and_throw().
   */
static void own_vm_erfc(sycl::queue& q, int acc, VmInputData* in, VmOutputData* out) {
    float* src_f = in->dev_sarg1;
    float* dst_f = out->dev_sres1;
    double* src_d = in->dev_darg1;
    double* dst_d = out->dev_dres1;

    // float elements
    q.parallel_for<class VmSerfc>(length, [=](sycl::id<1> idx) {
        const size_t k = idx[0];
        oneapi::mkl::vm::device::erfc(src_f + k, dst_f + k, vm_mode[acc]);
    });
    // double elements
    q.parallel_for<class VmDerfc>(length, [=](sycl::id<1> idx) {
        const size_t k = idx[0];
        oneapi::mkl::vm::device::erfc(src_d + k, dst_d + k, vm_mode[acc]);
    });

    // Wait for the kernels submitted above to finish.
    q.wait_and_throw();
} // own_vm_erfc
  /**
   * erfinv
   *
   * Enqueues a float kernel and a double kernel of `length` work-items each.
   * Work-item k passes the addresses of element k of dev_sarg1 (or dev_darg1)
   * and of dev_sres1 (or dev_dres1) to oneapi::mkl::vm::device::erfinv
   * together with vm_mode[acc]. Returns after q.wait_and_throw().
   */
static void own_vm_erfinv(sycl::queue& q, int acc, VmInputData* in, VmOutputData* out) {
    float* src_f = in->dev_sarg1;
    float* dst_f = out->dev_sres1;
    double* src_d = in->dev_darg1;
    double* dst_d = out->dev_dres1;

    // float elements
    q.parallel_for<class VmSerfinv>(length, [=](sycl::id<1> idx) {
        const size_t k = idx[0];
        oneapi::mkl::vm::device::erfinv(src_f + k, dst_f + k, vm_mode[acc]);
    });
    // double elements
    q.parallel_for<class VmDerfinv>(length, [=](sycl::id<1> idx) {
        const size_t k = idx[0];
        oneapi::mkl::vm::device::erfinv(src_d + k, dst_d + k, vm_mode[acc]);
    });

    // Wait for the kernels submitted above to finish.
    q.wait_and_throw();
} // own_vm_erfinv
  /**
   * erfcinv
   *
   * Enqueues a float kernel and a double kernel of `length` work-items each.
   * Work-item k passes the addresses of element k of dev_sarg1 (or dev_darg1)
   * and of dev_sres1 (or dev_dres1) to oneapi::mkl::vm::device::erfcinv
   * together with vm_mode[acc]. Returns after q.wait_and_throw().
   */
static void own_vm_erfcinv(sycl::queue& q, int acc, VmInputData* in, VmOutputData* out) {
    float* src_f = in->dev_sarg1;
    float* dst_f = out->dev_sres1;
    double* src_d = in->dev_darg1;
    double* dst_d = out->dev_dres1;

    // float elements
    q.parallel_for<class VmSerfcinv>(length, [=](sycl::id<1> idx) {
        const size_t k = idx[0];
        oneapi::mkl::vm::device::erfcinv(src_f + k, dst_f + k, vm_mode[acc]);
    });
    // double elements
    q.parallel_for<class VmDerfcinv>(length, [=](sycl::id<1> idx) {
        const size_t k = idx[0];
        oneapi::mkl::vm::device::erfcinv(src_d + k, dst_d + k, vm_mode[acc]);
    });

    // Wait for the kernels submitted above to finish.
    q.wait_and_throw();
} // own_vm_erfcinv
  /**
   * cdfnorm
   *
   * Enqueues a float kernel and a double kernel of `length` work-items each.
   * Work-item k passes the addresses of element k of dev_sarg1 (or dev_darg1)
   * and of dev_sres1 (or dev_dres1) to oneapi::mkl::vm::device::cdfnorm
   * together with vm_mode[acc]. Returns after q.wait_and_throw().
   */
static void own_vm_cdfnorm(sycl::queue& q, int acc, VmInputData* in, VmOutputData* out) {
    float* src_f = in->dev_sarg1;
    float* dst_f = out->dev_sres1;
    double* src_d = in->dev_darg1;
    double* dst_d = out->dev_dres1;

    // float elements
    q.parallel_for<class VmScdfnorm>(length, [=](sycl::id<1> idx) {
        const size_t k = idx[0];
        oneapi::mkl::vm::device::cdfnorm(src_f + k, dst_f + k, vm_mode[acc]);
    });
    // double elements
    q.parallel_for<class VmDcdfnorm>(length, [=](sycl::id<1> idx) {
        const size_t k = idx[0];
        oneapi::mkl::vm::device::cdfnorm(src_d + k, dst_d + k, vm_mode[acc]);
    });

    // Wait for the kernels submitted above to finish.
    q.wait_and_throw();
} // own_vm_cdfnorm
  /**
   * cdfnorminv
   *
   * Enqueues a float kernel and a double kernel of `length` work-items each.
   * Work-item k passes the addresses of element k of dev_sarg1 (or dev_darg1)
   * and of dev_sres1 (or dev_dres1) to oneapi::mkl::vm::device::cdfnorminv
   * together with vm_mode[acc]. Returns after q.wait_and_throw().
   */
static void own_vm_cdfnorminv(sycl::queue& q, int acc, VmInputData* in, VmOutputData* out) {
    float* src_f = in->dev_sarg1;
    float* dst_f = out->dev_sres1;
    double* src_d = in->dev_darg1;
    double* dst_d = out->dev_dres1;

    // float elements
    q.parallel_for<class VmScdfnorminv>(length, [=](sycl::id<1> idx) {
        const size_t k = idx[0];
        oneapi::mkl::vm::device::cdfnorminv(src_f + k, dst_f + k, vm_mode[acc]);
    });
    // double elements
    q.parallel_for<class VmDcdfnorminv>(length, [=](sycl::id<1> idx) {
        const size_t k = idx[0];
        oneapi::mkl::vm::device::cdfnorminv(src_d + k, dst_d + k, vm_mode[acc]);
    });

    // Wait for the kernels submitted above to finish.
    q.wait_and_throw();
} // own_vm_cdfnorminv
  /**
   * ceil
   *
   * Enqueues a float kernel and a double kernel of `length` work-items each.
   * Work-item k passes the addresses of element k of dev_sarg1 (or dev_darg1)
   * and of dev_sres1 (or dev_dres1) to oneapi::mkl::vm::device::ceil
   * together with vm_mode[acc]. Returns after q.wait_and_throw().
   */
static void own_vm_ceil(sycl::queue& q, int acc, VmInputData* in, VmOutputData* out) {
    float* src_f = in->dev_sarg1;
    float* dst_f = out->dev_sres1;
    double* src_d = in->dev_darg1;
    double* dst_d = out->dev_dres1;

    // float elements
    q.parallel_for<class VmSceil>(length, [=](sycl::id<1> idx) {
        const size_t k = idx[0];
        oneapi::mkl::vm::device::ceil(src_f + k, dst_f + k, vm_mode[acc]);
    });
    // double elements
    q.parallel_for<class VmDceil>(length, [=](sycl::id<1> idx) {
        const size_t k = idx[0];
        oneapi::mkl::vm::device::ceil(src_d + k, dst_d + k, vm_mode[acc]);
    });

    // Wait for the kernels submitted above to finish.
    q.wait_and_throw();
} // own_vm_ceil
  /**
   * floor
   *
   * Enqueues a float kernel and a double kernel of `length` work-items each.
   * Work-item k passes the addresses of element k of dev_sarg1 (or dev_darg1)
   * and of dev_sres1 (or dev_dres1) to oneapi::mkl::vm::device::floor
   * together with vm_mode[acc]. Returns after q.wait_and_throw().
   */
static void own_vm_floor(sycl::queue& q, int acc, VmInputData* in, VmOutputData* out) {
    float* src_f = in->dev_sarg1;
    float* dst_f = out->dev_sres1;
    double* src_d = in->dev_darg1;
    double* dst_d = out->dev_dres1;

    // float elements
    q.parallel_for<class VmSfloor>(length, [=](sycl::id<1> idx) {
        const size_t k = idx[0];
        oneapi::mkl::vm::device::floor(src_f + k, dst_f + k, vm_mode[acc]);
    });
    // double elements
    q.parallel_for<class VmDfloor>(length, [=](sycl::id<1> idx) {
        const size_t k = idx[0];
        oneapi::mkl::vm::device::floor(src_d + k, dst_d + k, vm_mode[acc]);
    });

    // Wait for the kernels submitted above to finish.
    q.wait_and_throw();
} // own_vm_floor
  /**
   * round
   *
   * Enqueues a float kernel and a double kernel of `length` work-items each.
   * Work-item k passes the addresses of element k of dev_sarg1 (or dev_darg1)
   * and of dev_sres1 (or dev_dres1) to oneapi::mkl::vm::device::round
   * together with vm_mode[acc]. Returns after q.wait_and_throw().
   */
static void own_vm_round(sycl::queue& q, int acc, VmInputData* in, VmOutputData* out) {
    float* src_f = in->dev_sarg1;
    float* dst_f = out->dev_sres1;
    double* src_d = in->dev_darg1;
    double* dst_d = out->dev_dres1;

    // float elements
    q.parallel_for<class VmSround>(length, [=](sycl::id<1> idx) {
        const size_t k = idx[0];
        oneapi::mkl::vm::device::round(src_f + k, dst_f + k, vm_mode[acc]);
    });
    // double elements
    q.parallel_for<class VmDround>(length, [=](sycl::id<1> idx) {
        const size_t k = idx[0];
        oneapi::mkl::vm::device::round(src_d + k, dst_d + k, vm_mode[acc]);
    });

    // Wait for the kernels submitted above to finish.
    q.wait_and_throw();
} // own_vm_round
  /**
   * trunc
   *
   * Enqueues a float kernel and a double kernel of `length` work-items each.
   * Work-item k passes the addresses of element k of dev_sarg1 (or dev_darg1)
   * and of dev_sres1 (or dev_dres1) to oneapi::mkl::vm::device::trunc
   * together with vm_mode[acc]. Returns after q.wait_and_throw().
   */
static void own_vm_trunc(sycl::queue& q, int acc, VmInputData* in, VmOutputData* out) {
    float* src_f = in->dev_sarg1;
    float* dst_f = out->dev_sres1;
    double* src_d = in->dev_darg1;
    double* dst_d = out->dev_dres1;

    // float elements
    q.parallel_for<class VmStrunc>(length, [=](sycl::id<1> idx) {
        const size_t k = idx[0];
        oneapi::mkl::vm::device::trunc(src_f + k, dst_f + k, vm_mode[acc]);
    });
    // double elements
    q.parallel_for<class VmDtrunc>(length, [=](sycl::id<1> idx) {
        const size_t k = idx[0];
        oneapi::mkl::vm::device::trunc(src_d + k, dst_d + k, vm_mode[acc]);
    });

    // Wait for the kernels submitted above to finish.
    q.wait_and_throw();
} // own_vm_trunc
  /**
   * rint
   *
   * Enqueues a float kernel and a double kernel of `length` work-items each.
   * Work-item k passes the addresses of element k of dev_sarg1 (or dev_darg1)
   * and of dev_sres1 (or dev_dres1) to oneapi::mkl::vm::device::rint
   * together with vm_mode[acc]. Returns after q.wait_and_throw().
   */
static void own_vm_rint(sycl::queue& q, int acc, VmInputData* in, VmOutputData* out) {
    float* src_f = in->dev_sarg1;
    float* dst_f = out->dev_sres1;
    double* src_d = in->dev_darg1;
    double* dst_d = out->dev_dres1;

    // float elements
    q.parallel_for<class VmSrint>(length, [=](sycl::id<1> idx) {
        const size_t k = idx[0];
        oneapi::mkl::vm::device::rint(src_f + k, dst_f + k, vm_mode[acc]);
    });
    // double elements
    q.parallel_for<class VmDrint>(length, [=](sycl::id<1> idx) {
        const size_t k = idx[0];
        oneapi::mkl::vm::device::rint(src_d + k, dst_d + k, vm_mode[acc]);
    });

    // Wait for the kernels submitted above to finish.
    q.wait_and_throw();
} // own_vm_rint
  /**
   * nearbyint
   *
   * Enqueues a float kernel and a double kernel of `length` work-items each.
   * Work-item k passes the addresses of element k of dev_sarg1 (or dev_darg1)
   * and of dev_sres1 (or dev_dres1) to oneapi::mkl::vm::device::nearbyint
   * together with vm_mode[acc]. Returns after q.wait_and_throw().
   */
static void own_vm_nearbyint(sycl::queue& q, int acc, VmInputData* in, VmOutputData* out) {
    float* src_f = in->dev_sarg1;
    float* dst_f = out->dev_sres1;
    double* src_d = in->dev_darg1;
    double* dst_d = out->dev_dres1;

    // float elements
    q.parallel_for<class VmSnearbyint>(length, [=](sycl::id<1> idx) {
        const size_t k = idx[0];
        oneapi::mkl::vm::device::nearbyint(src_f + k, dst_f + k, vm_mode[acc]);
    });
    // double elements
    q.parallel_for<class VmDnearbyint>(length, [=](sycl::id<1> idx) {
        const size_t k = idx[0];
        oneapi::mkl::vm::device::nearbyint(src_d + k, dst_d + k, vm_mode[acc]);
    });

    // Wait for the kernels submitted above to finish.
    q.wait_and_throw();
} // own_vm_nearbyint
  /**
   * remainder
   *
   * Enqueues a float kernel and a double kernel of `length` work-items each.
   * Work-item k passes the addresses of element k of dev_sarg1/dev_sarg2 (or
   * dev_darg1/dev_darg2) and of dev_sres1 (or dev_dres1) to
   * oneapi::mkl::vm::device::remainder together with vm_mode[acc].
   * Returns after q.wait_and_throw().
   */
static void own_vm_remainder(sycl::queue& q, int acc, VmInputData* in, VmOutputData* out) {
    float* lhs_f = in->dev_sarg1;
    float* rhs_f = in->dev_sarg2;
    float* dst_f = out->dev_sres1;
    double* lhs_d = in->dev_darg1;
    double* rhs_d = in->dev_darg2;
    double* dst_d = out->dev_dres1;

    // float elements
    q.parallel_for<class VmSremainder>(length, [=](sycl::id<1> idx) {
        const size_t k = idx[0];
        oneapi::mkl::vm::device::remainder(lhs_f + k, rhs_f + k, dst_f + k, vm_mode[acc]);
    });
    // double elements
    q.parallel_for<class VmDremainder>(length, [=](sycl::id<1> idx) {
        const size_t k = idx[0];
        oneapi::mkl::vm::device::remainder(lhs_d + k, rhs_d + k, dst_d + k, vm_mode[acc]);
    });

    // Wait for the kernels submitted above to finish.
    q.wait_and_throw();
} // own_vm_remainder
  /**
   * add
   *
   * Enqueues four kernels of `length` work-items each: float, double,
   * std::complex<float> and std::complex<double>. Work-item k passes the
   * addresses of element k of the matching dev_*arg1 / dev_*arg2 inputs and
   * dev_*res1 output to oneapi::mkl::vm::device::add together with
   * vm_mode[acc]. Returns after q.wait_and_throw().
   */
static void own_vm_add(sycl::queue& q, int acc, VmInputData* in, VmOutputData* out) {
    float* lhs_f = in->dev_sarg1;
    float* rhs_f = in->dev_sarg2;
    float* dst_f = out->dev_sres1;
    double* lhs_d = in->dev_darg1;
    double* rhs_d = in->dev_darg2;
    double* dst_d = out->dev_dres1;
    std::complex<float>* lhs_c = in->dev_carg1;
    std::complex<float>* rhs_c = in->dev_carg2;
    std::complex<float>* dst_c = out->dev_cres1;
    std::complex<double>* lhs_z = in->dev_zarg1;
    std::complex<double>* rhs_z = in->dev_zarg2;
    std::complex<double>* dst_z = out->dev_zres1;

    // float elements
    q.parallel_for<class VmSadd>(length, [=](sycl::id<1> idx) {
        const size_t k = idx[0];
        oneapi::mkl::vm::device::add(lhs_f + k, rhs_f + k, dst_f + k, vm_mode[acc]);
    });
    // double elements
    q.parallel_for<class VmDadd>(length, [=](sycl::id<1> idx) {
        const size_t k = idx[0];
        oneapi::mkl::vm::device::add(lhs_d + k, rhs_d + k, dst_d + k, vm_mode[acc]);
    });
    // std::complex<float> elements
    q.parallel_for<class VmCadd>(length, [=](sycl::id<1> idx) {
        const size_t k = idx[0];
        oneapi::mkl::vm::device::add(lhs_c + k, rhs_c + k, dst_c + k, vm_mode[acc]);
    });
    // std::complex<double> elements
    q.parallel_for<class VmZadd>(length, [=](sycl::id<1> idx) {
        const size_t k = idx[0];
        oneapi::mkl::vm::device::add(lhs_z + k, rhs_z + k, dst_z + k, vm_mode[acc]);
    });

    // Wait for the kernels submitted above to finish.
    q.wait_and_throw();
} // own_vm_add
  /**
   * sub
   *
   * Enqueues four kernels of `length` work-items each: float, double,
   * std::complex<float> and std::complex<double>. Work-item k passes the
   * addresses of element k of the matching dev_*arg1 / dev_*arg2 inputs and
   * dev_*res1 output to oneapi::mkl::vm::device::sub together with
   * vm_mode[acc]. Returns after q.wait_and_throw().
   */
static void own_vm_sub(sycl::queue& q, int acc, VmInputData* in, VmOutputData* out) {
    float* lhs_f = in->dev_sarg1;
    float* rhs_f = in->dev_sarg2;
    float* dst_f = out->dev_sres1;
    double* lhs_d = in->dev_darg1;
    double* rhs_d = in->dev_darg2;
    double* dst_d = out->dev_dres1;
    std::complex<float>* lhs_c = in->dev_carg1;
    std::complex<float>* rhs_c = in->dev_carg2;
    std::complex<float>* dst_c = out->dev_cres1;
    std::complex<double>* lhs_z = in->dev_zarg1;
    std::complex<double>* rhs_z = in->dev_zarg2;
    std::complex<double>* dst_z = out->dev_zres1;

    // float elements
    q.parallel_for<class VmSsub>(length, [=](sycl::id<1> idx) {
        const size_t k = idx[0];
        oneapi::mkl::vm::device::sub(lhs_f + k, rhs_f + k, dst_f + k, vm_mode[acc]);
    });
    // double elements
    q.parallel_for<class VmDsub>(length, [=](sycl::id<1> idx) {
        const size_t k = idx[0];
        oneapi::mkl::vm::device::sub(lhs_d + k, rhs_d + k, dst_d + k, vm_mode[acc]);
    });
    // std::complex<float> elements
    q.parallel_for<class VmCsub>(length, [=](sycl::id<1> idx) {
        const size_t k = idx[0];
        oneapi::mkl::vm::device::sub(lhs_c + k, rhs_c + k, dst_c + k, vm_mode[acc]);
    });
    // std::complex<double> elements
    q.parallel_for<class VmZsub>(length, [=](sycl::id<1> idx) {
        const size_t k = idx[0];
        oneapi::mkl::vm::device::sub(lhs_z + k, rhs_z + k, dst_z + k, vm_mode[acc]);
    });

    // Wait for the kernels submitted above to finish.
    q.wait_and_throw();
} // own_vm_sub
  /**
   * mul
   *
   * Enqueues four kernels of `length` work-items each: float, double,
   * std::complex<float> and std::complex<double>. Work-item k passes the
   * addresses of element k of the matching dev_*arg1 / dev_*arg2 inputs and
   * dev_*res1 output to oneapi::mkl::vm::device::mul together with
   * vm_mode[acc]. Returns after q.wait_and_throw().
   */
static void own_vm_mul(sycl::queue& q, int acc, VmInputData* in, VmOutputData* out) {
    float* lhs_f = in->dev_sarg1;
    float* rhs_f = in->dev_sarg2;
    float* dst_f = out->dev_sres1;
    double* lhs_d = in->dev_darg1;
    double* rhs_d = in->dev_darg2;
    double* dst_d = out->dev_dres1;
    std::complex<float>* lhs_c = in->dev_carg1;
    std::complex<float>* rhs_c = in->dev_carg2;
    std::complex<float>* dst_c = out->dev_cres1;
    std::complex<double>* lhs_z = in->dev_zarg1;
    std::complex<double>* rhs_z = in->dev_zarg2;
    std::complex<double>* dst_z = out->dev_zres1;

    // float elements
    q.parallel_for<class VmSmul>(length, [=](sycl::id<1> idx) {
        const size_t k = idx[0];
        oneapi::mkl::vm::device::mul(lhs_f + k, rhs_f + k, dst_f + k, vm_mode[acc]);
    });
    // double elements
    q.parallel_for<class VmDmul>(length, [=](sycl::id<1> idx) {
        const size_t k = idx[0];
        oneapi::mkl::vm::device::mul(lhs_d + k, rhs_d + k, dst_d + k, vm_mode[acc]);
    });
    // std::complex<float> elements
    q.parallel_for<class VmCmul>(length, [=](sycl::id<1> idx) {
        const size_t k = idx[0];
        oneapi::mkl::vm::device::mul(lhs_c + k, rhs_c + k, dst_c + k, vm_mode[acc]);
    });
    // std::complex<double> elements
    q.parallel_for<class VmZmul>(length, [=](sycl::id<1> idx) {
        const size_t k = idx[0];
        oneapi::mkl::vm::device::mul(lhs_z + k, rhs_z + k, dst_z + k, vm_mode[acc]);
    });

    // Wait for the kernels submitted above to finish.
    q.wait_and_throw();
} // own_vm_mul
  /**
   * div
   *
   * Enqueues four kernels of `length` work-items each: float, double,
   * std::complex<float> and std::complex<double>. Work-item k passes the
   * addresses of element k of the matching dev_*arg1 / dev_*arg2 inputs and
   * dev_*res1 output to oneapi::mkl::vm::device::div together with
   * vm_mode[acc]. Returns after q.wait_and_throw().
   */
static void own_vm_div(sycl::queue& q, int acc, VmInputData* in, VmOutputData* out) {
    float* lhs_f = in->dev_sarg1;
    float* rhs_f = in->dev_sarg2;
    float* dst_f = out->dev_sres1;
    double* lhs_d = in->dev_darg1;
    double* rhs_d = in->dev_darg2;
    double* dst_d = out->dev_dres1;
    std::complex<float>* lhs_c = in->dev_carg1;
    std::complex<float>* rhs_c = in->dev_carg2;
    std::complex<float>* dst_c = out->dev_cres1;
    std::complex<double>* lhs_z = in->dev_zarg1;
    std::complex<double>* rhs_z = in->dev_zarg2;
    std::complex<double>* dst_z = out->dev_zres1;

    // float elements
    q.parallel_for<class VmSdiv>(length, [=](sycl::id<1> idx) {
        const size_t k = idx[0];
        oneapi::mkl::vm::device::div(lhs_f + k, rhs_f + k, dst_f + k, vm_mode[acc]);
    });
    // double elements
    q.parallel_for<class VmDdiv>(length, [=](sycl::id<1> idx) {
        const size_t k = idx[0];
        oneapi::mkl::vm::device::div(lhs_d + k, rhs_d + k, dst_d + k, vm_mode[acc]);
    });
    // std::complex<float> elements
    q.parallel_for<class VmCdiv>(length, [=](sycl::id<1> idx) {
        const size_t k = idx[0];
        oneapi::mkl::vm::device::div(lhs_c + k, rhs_c + k, dst_c + k, vm_mode[acc]);
    });
    // std::complex<double> elements
    q.parallel_for<class VmZdiv>(length, [=](sycl::id<1> idx) {
        const size_t k = idx[0];
        oneapi::mkl::vm::device::div(lhs_z + k, rhs_z + k, dst_z + k, vm_mode[acc]);
    });

    // Wait for the kernels submitted above to finish.
    q.wait_and_throw();
} // own_vm_div
  /**
   * sqr
   *
   * Enqueues a float kernel and a double kernel of `length` work-items each.
   * Work-item k passes the addresses of element k of dev_sarg1 (or dev_darg1)
   * and of dev_sres1 (or dev_dres1) to oneapi::mkl::vm::device::sqr
   * together with vm_mode[acc]. Returns after q.wait_and_throw().
   */
static void own_vm_sqr(sycl::queue& q, int acc, VmInputData* in, VmOutputData* out) {
    float* src_f = in->dev_sarg1;
    float* dst_f = out->dev_sres1;
    double* src_d = in->dev_darg1;
    double* dst_d = out->dev_dres1;

    // float elements
    q.parallel_for<class VmSsqr>(length, [=](sycl::id<1> idx) {
        const size_t k = idx[0];
        oneapi::mkl::vm::device::sqr(src_f + k, dst_f + k, vm_mode[acc]);
    });
    // double elements
    q.parallel_for<class VmDsqr>(length, [=](sycl::id<1> idx) {
        const size_t k = idx[0];
        oneapi::mkl::vm::device::sqr(src_d + k, dst_d + k, vm_mode[acc]);
    });

    // Wait for the kernels submitted above to finish.
    q.wait_and_throw();
} // own_vm_sqr
  /**
   * inv
   *
   * Enqueues a float kernel and a double kernel of `length` work-items each.
   * Work-item k passes the addresses of element k of dev_sarg1 (or dev_darg1)
   * and of dev_sres1 (or dev_dres1) to oneapi::mkl::vm::device::inv
   * together with vm_mode[acc]. Returns after q.wait_and_throw().
   */
static void own_vm_inv(sycl::queue& q, int acc, VmInputData* in, VmOutputData* out) {
    float* src_f = in->dev_sarg1;
    float* dst_f = out->dev_sres1;
    double* src_d = in->dev_darg1;
    double* dst_d = out->dev_dres1;

    // float elements
    q.parallel_for<class VmSinv>(length, [=](sycl::id<1> idx) {
        const size_t k = idx[0];
        oneapi::mkl::vm::device::inv(src_f + k, dst_f + k, vm_mode[acc]);
    });
    // double elements
    q.parallel_for<class VmDinv>(length, [=](sycl::id<1> idx) {
        const size_t k = idx[0];
        oneapi::mkl::vm::device::inv(src_d + k, dst_d + k, vm_mode[acc]);
    });

    // Wait for the kernels submitted above to finish.
    q.wait_and_throw();
} // own_vm_inv
  /**
   * modf
   *
   * Enqueues a float kernel and a double kernel of `length` work-items each.
   * Work-item k passes the address of element k of dev_sarg1 (or dev_darg1)
   * and of the two result arrays dev_sres1/dev_sres2 (or dev_dres1/dev_dres2)
   * to oneapi::mkl::vm::device::modf together with vm_mode[acc].
   * Returns after q.wait_and_throw().
   */
static void own_vm_modf(sycl::queue& q, int acc, VmInputData* in, VmOutputData* out) {
    float* src_f = in->dev_sarg1;
    float* dst1_f = out->dev_sres1;
    float* dst2_f = out->dev_sres2;
    double* src_d = in->dev_darg1;
    double* dst1_d = out->dev_dres1;
    double* dst2_d = out->dev_dres2;

    // float elements
    q.parallel_for<class VmSmodf>(length, [=](sycl::id<1> idx) {
        const size_t k = idx[0];
        oneapi::mkl::vm::device::modf(src_f + k, dst1_f + k, dst2_f + k, vm_mode[acc]);
    });
    // double elements
    q.parallel_for<class VmDmodf>(length, [=](sycl::id<1> idx) {
        const size_t k = idx[0];
        oneapi::mkl::vm::device::modf(src_d + k, dst1_d + k, dst2_d + k, vm_mode[acc]);
    });

    // Wait for the kernels submitted above to finish.
    q.wait_and_throw();
} // own_vm_modf
  /**
   * fmod
   *
   * Enqueues a float kernel and a double kernel of `length` work-items each.
   * Work-item k passes the addresses of element k of dev_sarg1/dev_sarg2 (or
   * dev_darg1/dev_darg2) and of dev_sres1 (or dev_dres1) to
   * oneapi::mkl::vm::device::fmod together with vm_mode[acc].
   * Returns after q.wait_and_throw().
   */
static void own_vm_fmod(sycl::queue& q, int acc, VmInputData* in, VmOutputData* out) {
    float* lhs_f = in->dev_sarg1;
    float* rhs_f = in->dev_sarg2;
    float* dst_f = out->dev_sres1;
    double* lhs_d = in->dev_darg1;
    double* rhs_d = in->dev_darg2;
    double* dst_d = out->dev_dres1;

    // float elements
    q.parallel_for<class VmSfmod>(length, [=](sycl::id<1> idx) {
        const size_t k = idx[0];
        oneapi::mkl::vm::device::fmod(lhs_f + k, rhs_f + k, dst_f + k, vm_mode[acc]);
    });
    // double elements
    q.parallel_for<class VmDfmod>(length, [=](sycl::id<1> idx) {
        const size_t k = idx[0];
        oneapi::mkl::vm::device::fmod(lhs_d + k, rhs_d + k, dst_d + k, vm_mode[acc]);
    });

    // Wait for the kernels submitted above to finish.
    q.wait_and_throw();
} // own_vm_fmod
  /**
   * fdim
   *
   * Enqueues a float kernel and a double kernel of `length` work-items each.
   * Work-item k passes the addresses of element k of dev_sarg1/dev_sarg2 (or
   * dev_darg1/dev_darg2) and of dev_sres1 (or dev_dres1) to
   * oneapi::mkl::vm::device::fdim together with vm_mode[acc].
   * Returns after q.wait_and_throw().
   */
static void own_vm_fdim(sycl::queue& q, int acc, VmInputData* in, VmOutputData* out) {
    float* lhs_f = in->dev_sarg1;
    float* rhs_f = in->dev_sarg2;
    float* dst_f = out->dev_sres1;
    double* lhs_d = in->dev_darg1;
    double* rhs_d = in->dev_darg2;
    double* dst_d = out->dev_dres1;

    // float elements
    q.parallel_for<class VmSfdim>(length, [=](sycl::id<1> idx) {
        const size_t k = idx[0];
        oneapi::mkl::vm::device::fdim(lhs_f + k, rhs_f + k, dst_f + k, vm_mode[acc]);
    });
    // double elements
    q.parallel_for<class VmDfdim>(length, [=](sycl::id<1> idx) {
        const size_t k = idx[0];
        oneapi::mkl::vm::device::fdim(lhs_d + k, rhs_d + k, dst_d + k, vm_mode[acc]);
    });

    // Wait for the kernels submitted above to finish.
    q.wait_and_throw();
} // own_vm_fdim
  /**
   * fmax
   *
   * Enqueues a float kernel and a double kernel of `length` work-items each.
   * Work-item k passes the addresses of element k of dev_sarg1/dev_sarg2 (or
   * dev_darg1/dev_darg2) and of dev_sres1 (or dev_dres1) to
   * oneapi::mkl::vm::device::fmax together with vm_mode[acc].
   * Returns after q.wait_and_throw().
   */
static void own_vm_fmax(sycl::queue& q, int acc, VmInputData* in, VmOutputData* out) {
    float* lhs_f = in->dev_sarg1;
    float* rhs_f = in->dev_sarg2;
    float* dst_f = out->dev_sres1;
    double* lhs_d = in->dev_darg1;
    double* rhs_d = in->dev_darg2;
    double* dst_d = out->dev_dres1;

    // float elements
    q.parallel_for<class VmSfmax>(length, [=](sycl::id<1> idx) {
        const size_t k = idx[0];
        oneapi::mkl::vm::device::fmax(lhs_f + k, rhs_f + k, dst_f + k, vm_mode[acc]);
    });
    // double elements
    q.parallel_for<class VmDfmax>(length, [=](sycl::id<1> idx) {
        const size_t k = idx[0];
        oneapi::mkl::vm::device::fmax(lhs_d + k, rhs_d + k, dst_d + k, vm_mode[acc]);
    });

    // Wait for the kernels submitted above to finish.
    q.wait_and_throw();
} // own_vm_fmax
  /**
   * fmin
   *
   * Enqueues a float kernel and a double kernel of `length` work-items each.
   * Work-item k passes the addresses of element k of dev_sarg1/dev_sarg2 (or
   * dev_darg1/dev_darg2) and of dev_sres1 (or dev_dres1) to
   * oneapi::mkl::vm::device::fmin together with vm_mode[acc].
   * Returns after q.wait_and_throw().
   */
static void own_vm_fmin(sycl::queue& q, int acc, VmInputData* in, VmOutputData* out) {
    float* lhs_f = in->dev_sarg1;
    float* rhs_f = in->dev_sarg2;
    float* dst_f = out->dev_sres1;
    double* lhs_d = in->dev_darg1;
    double* rhs_d = in->dev_darg2;
    double* dst_d = out->dev_dres1;

    // float elements
    q.parallel_for<class VmSfmin>(length, [=](sycl::id<1> idx) {
        const size_t k = idx[0];
        oneapi::mkl::vm::device::fmin(lhs_f + k, rhs_f + k, dst_f + k, vm_mode[acc]);
    });
    // double elements
    q.parallel_for<class VmDfmin>(length, [=](sycl::id<1> idx) {
        const size_t k = idx[0];
        oneapi::mkl::vm::device::fmin(lhs_d + k, rhs_d + k, dst_d + k, vm_mode[acc]);
    });

    // Wait for the kernels submitted above to finish.
    q.wait_and_throw();
} // own_vm_fmin
  /**
   * maxmag
   *
   * Enqueues a float kernel and a double kernel of `length` work-items each.
   * Work-item k passes the addresses of element k of dev_sarg1/dev_sarg2 (or
   * dev_darg1/dev_darg2) and of dev_sres1 (or dev_dres1) to
   * oneapi::mkl::vm::device::maxmag together with vm_mode[acc].
   * Returns after q.wait_and_throw().
   */
static void own_vm_maxmag(sycl::queue& q, int acc, VmInputData* in, VmOutputData* out) {
    float* lhs_f = in->dev_sarg1;
    float* rhs_f = in->dev_sarg2;
    float* dst_f = out->dev_sres1;
    double* lhs_d = in->dev_darg1;
    double* rhs_d = in->dev_darg2;
    double* dst_d = out->dev_dres1;

    // float elements
    q.parallel_for<class VmSmaxmag>(length, [=](sycl::id<1> idx) {
        const size_t k = idx[0];
        oneapi::mkl::vm::device::maxmag(lhs_f + k, rhs_f + k, dst_f + k, vm_mode[acc]);
    });
    // double elements
    q.parallel_for<class VmDmaxmag>(length, [=](sycl::id<1> idx) {
        const size_t k = idx[0];
        oneapi::mkl::vm::device::maxmag(lhs_d + k, rhs_d + k, dst_d + k, vm_mode[acc]);
    });

    // Wait for the kernels submitted above to finish.
    q.wait_and_throw();
} // own_vm_maxmag
  /**
   * minmag
   *
   * Enqueues a float kernel and a double kernel of `length` work-items each.
   * Work-item k passes the addresses of element k of dev_sarg1/dev_sarg2 (or
   * dev_darg1/dev_darg2) and of dev_sres1 (or dev_dres1) to
   * oneapi::mkl::vm::device::minmag together with vm_mode[acc].
   * Returns after q.wait_and_throw().
   */
static void own_vm_minmag(sycl::queue& q, int acc, VmInputData* in, VmOutputData* out) {
    float* lhs_f = in->dev_sarg1;
    float* rhs_f = in->dev_sarg2;
    float* dst_f = out->dev_sres1;
    double* lhs_d = in->dev_darg1;
    double* rhs_d = in->dev_darg2;
    double* dst_d = out->dev_dres1;

    // float elements
    q.parallel_for<class VmSminmag>(length, [=](sycl::id<1> idx) {
        const size_t k = idx[0];
        oneapi::mkl::vm::device::minmag(lhs_f + k, rhs_f + k, dst_f + k, vm_mode[acc]);
    });
    // double elements
    q.parallel_for<class VmDminmag>(length, [=](sycl::id<1> idx) {
        const size_t k = idx[0];
        oneapi::mkl::vm::device::minmag(lhs_d + k, rhs_d + k, dst_d + k, vm_mode[acc]);
    });

    // Wait for the kernels submitted above to finish.
    q.wait_and_throw();
} // own_vm_minmag
  /**
   * nextafter
   *
   * Enqueues a float kernel and a double kernel of `length` work-items each.
   * Work-item k passes the addresses of element k of dev_sarg1/dev_sarg2 (or
   * dev_darg1/dev_darg2) and of dev_sres1 (or dev_dres1) to
   * oneapi::mkl::vm::device::nextafter together with vm_mode[acc].
   * Returns after q.wait_and_throw().
   */
static void own_vm_nextafter(sycl::queue& q, int acc, VmInputData* in, VmOutputData* out) {
    float* lhs_f = in->dev_sarg1;
    float* rhs_f = in->dev_sarg2;
    float* dst_f = out->dev_sres1;
    double* lhs_d = in->dev_darg1;
    double* rhs_d = in->dev_darg2;
    double* dst_d = out->dev_dres1;

    // float elements
    q.parallel_for<class VmSnextafter>(length, [=](sycl::id<1> idx) {
        const size_t k = idx[0];
        oneapi::mkl::vm::device::nextafter(lhs_f + k, rhs_f + k, dst_f + k, vm_mode[acc]);
    });
    // double elements
    q.parallel_for<class VmDnextafter>(length, [=](sycl::id<1> idx) {
        const size_t k = idx[0];
        oneapi::mkl::vm::device::nextafter(lhs_d + k, rhs_d + k, dst_d + k, vm_mode[acc]);
    });

    // Wait for the kernels submitted above to finish.
    q.wait_and_throw();
} // own_vm_nextafter
  /**
   * copysign
   *
   * Enqueues a float kernel and a double kernel of `length` work-items each.
   * Work-item k passes the addresses of element k of dev_sarg1/dev_sarg2 (or
   * dev_darg1/dev_darg2) and of dev_sres1 (or dev_dres1) to
   * oneapi::mkl::vm::device::copysign together with vm_mode[acc].
   * Returns after q.wait_and_throw().
   */
static void own_vm_copysign(sycl::queue& q, int acc, VmInputData* in, VmOutputData* out) {
    float* lhs_f = in->dev_sarg1;
    float* rhs_f = in->dev_sarg2;
    float* dst_f = out->dev_sres1;
    double* lhs_d = in->dev_darg1;
    double* rhs_d = in->dev_darg2;
    double* dst_d = out->dev_dres1;

    // float elements
    q.parallel_for<class VmScopysign>(length, [=](sycl::id<1> idx) {
        const size_t k = idx[0];
        oneapi::mkl::vm::device::copysign(lhs_f + k, rhs_f + k, dst_f + k, vm_mode[acc]);
    });
    // double elements
    q.parallel_for<class VmDcopysign>(length, [=](sycl::id<1> idx) {
        const size_t k = idx[0];
        oneapi::mkl::vm::device::copysign(lhs_d + k, rhs_d + k, dst_d + k, vm_mode[acc]);
    });

    // Wait for the kernels submitted above to finish.
    q.wait_and_throw();
} // own_vm_copysign
  /**
   * frac
   *
   * Enqueues a float kernel and a double kernel of `length` work-items each.
   * Work-item k passes the addresses of element k of dev_sarg1 (or dev_darg1)
   * and of dev_sres1 (or dev_dres1) to oneapi::mkl::vm::device::frac
   * together with vm_mode[acc]. Returns after q.wait_and_throw().
   */
static void own_vm_frac(sycl::queue& q, int acc, VmInputData* in, VmOutputData* out) {
    float* src_f = in->dev_sarg1;
    float* dst_f = out->dev_sres1;
    double* src_d = in->dev_darg1;
    double* dst_d = out->dev_dres1;

    // float elements
    q.parallel_for<class VmSfrac>(length, [=](sycl::id<1> idx) {
        const size_t k = idx[0];
        oneapi::mkl::vm::device::frac(src_f + k, dst_f + k, vm_mode[acc]);
    });
    // double elements
    q.parallel_for<class VmDfrac>(length, [=](sycl::id<1> idx) {
        const size_t k = idx[0];
        oneapi::mkl::vm::device::frac(src_d + k, dst_d + k, vm_mode[acc]);
    });

    // Wait for the kernels submitted above to finish.
    q.wait_and_throw();
} // own_vm_frac
  /**
   * conj
   *
   * Enqueues a std::complex<float> kernel and a std::complex<double> kernel
   * of `length` work-items each. Work-item k passes the addresses of element
   * k of dev_carg1 (or dev_zarg1) and of dev_cres1 (or dev_zres1) to
   * oneapi::mkl::vm::device::conj together with vm_mode[acc].
   * Returns after q.wait_and_throw().
   */
static void own_vm_conj(sycl::queue& q, int acc, VmInputData* in, VmOutputData* out) {
    std::complex<float>* src_c = in->dev_carg1;
    std::complex<float>* dst_c = out->dev_cres1;
    std::complex<double>* src_z = in->dev_zarg1;
    std::complex<double>* dst_z = out->dev_zres1;

    // std::complex<float> elements
    q.parallel_for<class VmCconj>(length, [=](sycl::id<1> idx) {
        const size_t k = idx[0];
        oneapi::mkl::vm::device::conj(src_c + k, dst_c + k, vm_mode[acc]);
    });
    // std::complex<double> elements
    q.parallel_for<class VmZconj>(length, [=](sycl::id<1> idx) {
        const size_t k = idx[0];
        oneapi::mkl::vm::device::conj(src_z + k, dst_z + k, vm_mode[acc]);
    });

    // Wait for the kernels submitted above to finish.
    q.wait_and_throw();
} // own_vm_conj
  /**
   * mulbyconj
   *
   * Enqueues a std::complex<float> kernel and a std::complex<double> kernel
   * of `length` work-items each. Work-item k passes the addresses of element
   * k of dev_carg1/dev_carg2 (or dev_zarg1/dev_zarg2) and of dev_cres1 (or
   * dev_zres1) to oneapi::mkl::vm::device::mulbyconj together with
   * vm_mode[acc]. Returns after q.wait_and_throw().
   */
static void own_vm_mulbyconj(sycl::queue& q, int acc, VmInputData* in, VmOutputData* out) {
    std::complex<float>* lhs_c = in->dev_carg1;
    std::complex<float>* rhs_c = in->dev_carg2;
    std::complex<float>* dst_c = out->dev_cres1;
    std::complex<double>* lhs_z = in->dev_zarg1;
    std::complex<double>* rhs_z = in->dev_zarg2;
    std::complex<double>* dst_z = out->dev_zres1;

    // std::complex<float> elements
    q.parallel_for<class VmCmulbyconj>(length, [=](sycl::id<1> idx) {
        const size_t k = idx[0];
        oneapi::mkl::vm::device::mulbyconj(lhs_c + k, rhs_c + k, dst_c + k, vm_mode[acc]);
    });
    // std::complex<double> elements
    q.parallel_for<class VmZmulbyconj>(length, [=](sycl::id<1> idx) {
        const size_t k = idx[0];
        oneapi::mkl::vm::device::mulbyconj(lhs_z + k, rhs_z + k, dst_z + k, vm_mode[acc]);
    });

    // Wait for the kernels submitted above to finish.
    q.wait_and_throw();
} // own_vm_mulbyconj
  /**
   * cis
   */
static void own_vm_cis(sycl::queue& q, int acc, VmInputData* in, VmOutputData* out) {
    // Real device arguments produce complex device results
    float* s_src = in->dev_sarg1;
    std::complex<float>* c_dst = out->dev_cres1;
    double* d_src = in->dev_darg1;
    std::complex<double>* z_dst = out->dev_zres1;

    // float -> complex<float>: every work-item handles the element at its own index
    q.parallel_for<class VmCcis>(length, [=](sycl::id<1> item) {
        const size_t pos = item.get(0);
        oneapi::mkl::vm::device::cis(s_src + pos, c_dst + pos, vm_mode[acc]);
    });
    // double -> complex<double>
    q.parallel_for<class VmZcis>(length, [=](sycl::id<1> item) {
        const size_t pos = item.get(0);
        oneapi::mkl::vm::device::cis(d_src + pos, z_dst + pos, vm_mode[acc]);
    });

    // Wait for both submissions before returning to the caller
    q.wait_and_throw();
} // own_vm_cis
  /**
   * arg
   */
static void own_vm_arg(sycl::queue& q, int acc, VmInputData* in, VmOutputData* out) {
    // Complex device arguments produce real results, stored in the csres1/zdres1 device arrays
    std::complex<float>* c_src = in->dev_carg1;
    float* s_dst = out->dev_csres1;
    std::complex<double>* z_src = in->dev_zarg1;
    double* d_dst = out->dev_zdres1;

    // complex<float> -> float: every work-item handles the element at its own index
    q.parallel_for<class VmCarg>(length, [=](sycl::id<1> item) {
        const size_t pos = item.get(0);
        oneapi::mkl::vm::device::arg(c_src + pos, s_dst + pos, vm_mode[acc]);
    });
    // complex<double> -> double
    q.parallel_for<class VmZarg>(length, [=](sycl::id<1> item) {
        const size_t pos = item.get(0);
        oneapi::mkl::vm::device::arg(z_src + pos, d_dst + pos, vm_mode[acc]);
    });

    // Wait for both submissions before returning to the caller
    q.wait_and_throw();
} // own_vm_arg
  /**
   * abs
   */
static void own_vm_abs(sycl::queue& q, int acc, VmInputData* in, VmOutputData* out) {
    // Real variants: argument and result share the precision
    float* s_src = in->dev_sarg1;
    float* s_dst = out->dev_sres1;
    double* d_src = in->dev_darg1;
    double* d_dst = out->dev_dres1;
    // Complex variants: complex argument, real result kept in the csres1/zdres1 device arrays
    std::complex<float>* c_src = in->dev_carg1;
    float* cs_dst = out->dev_csres1;
    std::complex<double>* z_src = in->dev_zarg1;
    double* zd_dst = out->dev_zdres1;

    // One submission per precision; every work-item handles the element at its own index
    q.parallel_for<class VmSabs>(length, [=](sycl::id<1> item) {
        const size_t pos = item.get(0);
        oneapi::mkl::vm::device::abs(s_src + pos, s_dst + pos, vm_mode[acc]);
    });
    q.parallel_for<class VmDabs>(length, [=](sycl::id<1> item) {
        const size_t pos = item.get(0);
        oneapi::mkl::vm::device::abs(d_src + pos, d_dst + pos, vm_mode[acc]);
    });
    q.parallel_for<class VmCabs>(length, [=](sycl::id<1> item) {
        const size_t pos = item.get(0);
        oneapi::mkl::vm::device::abs(c_src + pos, cs_dst + pos, vm_mode[acc]);
    });
    q.parallel_for<class VmZabs>(length, [=](sycl::id<1> item) {
        const size_t pos = item.get(0);
        oneapi::mkl::vm::device::abs(z_src + pos, zd_dst + pos, vm_mode[acc]);
    });

    // Wait for all four submissions before returning to the caller
    q.wait_and_throw();
} // own_vm_abs

/**
 * Custom reference functions absent in LIBM:
 */
// Real functions
/* signed fractional part: x minus its truncated integer part */
static double own_frac(double x) {
    const double whole = std::trunc(x);
    return (x - whole);
}
/* inverse cube root x^(-1/3) */
static double own_invcbrt(double x) {
    const double exponent = -(1.0 / 3.0);
    return std::pow(x, exponent);
}
/* power x^(2/3) */
static double own_pow2o3(double x) {
    const double exponent = 2.0 / 3.0;
    return std::pow(x, exponent);
}
/* power x^(3/2) */
static double own_pow3o2(double x) {
    const double exponent = 3.0 / 2.0;
    return std::pow(x, exponent);
}
/* inverse 1/x */
static double own_inv(double x) {
    return 1.0 / x;
}
/* square x*x */
static double own_sqr(double x) {
    return x * x;
}
/* addition x+y */
static double own_add(double x, double y) {
    return x + y;
}
/* subtraction x-y */
static double own_sub(double x, double y) {
    return x - y;
}
/* multiplication x*y */
static double own_mul(double x, double y) {
    return x * y;
}
/* division x/y */
static double own_div(double x, double y) {
    return x / y;
}
/* argument with the smaller magnitude; y when |x| is not less than |y| */
static double own_minmag(double x, double y) {
    if (std::fabs(x) < std::fabs(y)) {
        return x;
    }
    return y;
}
/* argument with the larger magnitude; y when |x| is not greater than |y| */
static double own_maxmag(double x, double y) {
    if (std::fabs(x) > std::fabs(y)) {
        return x;
    }
    return y;
}
/* split x into its truncated integer part (*r1) and the remaining signed fraction (*r2) */
static void own_modf(double x, double* r1, double* r2) {
    double int_part = 0.0;
    const double frac_part = std::modf(x, &int_part);
    *r1 = int_part;
    *r2 = frac_part;
}
/* linear fraction (x*a + b)/(y*c + d) */
static double own_lfrac(double x, double y, double a, double b, double c, double d) {
    const double numerator = x * a + b;
    const double denominator = y * c + d;
    return numerator / denominator;
}
/* exponential integral - exists in VM only, so the reference is taken from the host VM call in HA mode */
static double own_expint1(double x) {
    double value;
    // Evaluate a vector of length one
    vmdExpInt1(1, &x, &value, VML_HA);
    return value;
}
/* sin & cos: sine goes to *r1, cosine to *r2 */
static void own_sincos(double x, double* r1, double* r2) {
    *r1 = std::sin(x);
    *r2 = std::cos(x);
}
/* inverse square root 1/sqrt(x) */
static double own_invsqrt(double x) {
    const double root = std::sqrt(x);
    return 1.0 / root;
}

// Complex functions
/* complex acos */
static std::complex<double> own_cacos(std::complex<double> x) {
    return std::acos(x);
}
/* complex asin */
static std::complex<double> own_casin(std::complex<double> x) {
    return std::asin(x);
}
/* complex atan */
static std::complex<double> own_catan(std::complex<double> x) {
    return std::atan(x);
}
/* complex acosh */
static std::complex<double> own_cacosh(std::complex<double> x) {
    return std::acosh(x);
}
/* complex asinh */
static std::complex<double> own_casinh(std::complex<double> x) {
    return std::asinh(x);
}
/* complex atanh */
static std::complex<double> own_catanh(std::complex<double> x) {
    return std::atanh(x);
}
/* complex cos */
static std::complex<double> own_ccos(std::complex<double> x) {
    return std::cos(x);
}
/* complex sin */
static std::complex<double> own_csin(std::complex<double> x) {
    return std::sin(x);
}
/* complex tan */
static std::complex<double> own_ctan(std::complex<double> x) {
    return std::tan(x);
}
/* complex cosh */
static std::complex<double> own_ccosh(std::complex<double> x) {
    return std::cosh(x);
}
/* complex sinh */
static std::complex<double> own_csinh(std::complex<double> x) {
    return std::sinh(x);
}
/* complex tanh */
static std::complex<double> own_ctanh(std::complex<double> x) {
    return std::tanh(x);
}
/* complex exp */
static std::complex<double> own_cexp(std::complex<double> x) {
    return std::exp(x);
}
/* complex natural log */
static std::complex<double> own_clog(std::complex<double> x) {
    return std::log(x);
}
/* complex log10 */
static std::complex<double> own_clog10(std::complex<double> x) {
    return std::log10(x);
}
/* complex conjugate */
static std::complex<double> own_conj(std::complex<double> x) {
    return std::conj(x);
}
/* complex sqrt */
static std::complex<double> own_csqrt(std::complex<double> x) {
    return std::sqrt(x);
}
/* complex x+y */
static std::complex<double> own_cadd(std::complex<double> x, std::complex<double> y) {
    return x + y;
}
/* complex x-y */
static std::complex<double> own_csub(std::complex<double> x, std::complex<double> y) {
    return x - y;
}
/* complex x*y */
static std::complex<double> own_cmul(std::complex<double> x, std::complex<double> y) {
    return x * y;
}
/* complex x/y */
static std::complex<double> own_cdiv(std::complex<double> x, std::complex<double> y) {
    return x / y;
}
/* complex pow x^y */
static std::complex<double> own_cpow(std::complex<double> x, std::complex<double> y) {
    return std::pow(x, y);
}
/* complex x*conj(y) */
static std::complex<double> own_cmulbyconj(std::complex<double> x, std::complex<double> y) {
    const std::complex<double> y_conj = std::conj(y);
    return x * y_conj;
}
/* complex CIS (sincos): cos(x) + i*sin(x) */
static std::complex<double> own_cis(double x) {
    double sin_part, cos_part;
    own_sincos(x, &sin_part, &cos_part);
    // Real part is the cosine, imaginary part is the sine
    return std::complex<double>(cos_part, sin_part);
}
/* complex |x| */
static double own_cabs(std::complex<double> x) {
    const double re = x.real();
    const double im = x.imag();
    return std::hypot(re, im);
}
/* complex argument (atan2 of imaginary over real part) */
static double own_carg(std::complex<double> x) {
    const double re = x.real();
    const double im = x.imag();
    return std::atan2(im, re);
}

/**
 * @brief Testing preamble.
 *
 * Print device and driver info.
 *
 * @param[in] dev    Sycl device
 *
 */
void own_preamble(sycl::device& dev) {
    // Query the device name and the driver version string
    const std::string dev_name = dev.get_info<sycl::info::device::name>();
    // info::device::driver_version is the descriptor that matches the "driver version" label;
    // info::device::version reports the device version instead
    const std::string driver_version = dev.get_info<sycl::info::device::driver_version>();

    fprintf(stdout, "\t       device name: %s\n", dev_name.c_str());
    fprintf(stdout, "\t    driver version: %s\n\n", driver_version.c_str());
    fflush(stdout);
}

/**
 * @brief Asynchronous error handler.
 *
 * Async sycl error catching procedure.
 *
 * @param[in] el Exceptions list
 *
 */
void own_async_sycl_error(sycl::exception_list el) {
    fprintf(stderr, "async exceptions caught: \n");

    // Rethrow every stored exception so that SYCL ones can be caught and reported individually
    for (const auto& pending : el) {
        try {
            std::rethrow_exception(pending);
        } catch (const sycl::exception& e) {
            fprintf(stderr, "SYCL exception occured with code %d with %s\n", e.code().value(), e.what());
        }
    }
} // own_async_sycl_error

/**
 * @brief Allocate all input and reference result arrays
 *
 * Allocation of input and reference result arrays memory
 *
 * @param[in]      q        Sycl queue
 * @param[in, out] in       Input and reference result arrays
 * @param[in, out] out      Output arrays
 *
 */
static void own_allocate_data(sycl::queue& q, VmInputData* in, VmOutputData* out) {
    const size_t count = static_cast<size_t>(length);

    // ---- Inputs: host and device argument arrays, grouped by precision ----
    // single precision real
    in->sarg1 = new float[count];
    in->sarg2 = new float[count];
    in->dev_sarg1 = sycl::malloc_device<float>(count, q);
    in->dev_sarg2 = sycl::malloc_device<float>(count, q);
    // double precision real
    in->darg1 = new double[count];
    in->darg2 = new double[count];
    in->dev_darg1 = sycl::malloc_device<double>(count, q);
    in->dev_darg2 = sycl::malloc_device<double>(count, q);
    // single precision complex
    in->carg1 = new std::complex<float>[count];
    in->carg2 = new std::complex<float>[count];
    in->dev_carg1 = sycl::malloc_device<std::complex<float>>(count, q);
    in->dev_carg2 = sycl::malloc_device<std::complex<float>>(count, q);
    // double precision complex
    in->zarg1 = new std::complex<double>[count];
    in->zarg2 = new std::complex<double>[count];
    in->dev_zarg1 = sycl::malloc_device<std::complex<double>>(count, q);
    in->dev_zarg2 = sycl::malloc_device<std::complex<double>>(count, q);

    // ---- Host reference result arrays (kept in double precision for every variant) ----
    in->sref1 = new double[count];
    in->sref2 = new double[count];
    in->dref1 = new double[count];
    in->dref2 = new double[count];
    in->csref1 = new double[count];
    in->zdref1 = new double[count];
    in->cref1 = new std::complex<double>[count];
    in->zref1 = new std::complex<double>[count];

    // ---- Outputs: host and device result arrays, grouped by precision ----
    // single precision real
    out->sres1 = new float[count];
    out->sres2 = new float[count];
    out->dev_sres1 = sycl::malloc_device<float>(count, q);
    out->dev_sres2 = sycl::malloc_device<float>(count, q);
    // double precision real
    out->dres1 = new double[count];
    out->dres2 = new double[count];
    out->dev_dres1 = sycl::malloc_device<double>(count, q);
    out->dev_dres2 = sycl::malloc_device<double>(count, q);
    // real results of complex-argument functions
    out->csres1 = new float[count];
    out->zdres1 = new double[count];
    out->dev_csres1 = sycl::malloc_device<float>(count, q);
    out->dev_zdres1 = sycl::malloc_device<double>(count, q);
    // complex results
    out->cres1 = new std::complex<float>[count];
    out->zres1 = new std::complex<double>[count];
    out->dev_cres1 = sycl::malloc_device<std::complex<float>>(count, q);
    out->dev_zres1 = sycl::malloc_device<std::complex<double>>(count, q);
} // own_allocate_data

/**
 * @brief Deallocate all input and reference result arrays
 *
 * Deallocation of input and reference result arrays memory
 *
 * @param[in]      q        Sycl queue
 * @param[in, out] in       Input and reference result arrays
 * @param[in, out] out      Output arrays
 *
 */
static void own_deallocate_data(sycl::queue& q, VmInputData* in, VmOutputData* out) {
    // All host arrays are created with new[] in own_allocate_data, so they must be
    // released with delete[]; scalar delete on them is undefined behavior.

    // Free host argument arrays
    delete[] in->sarg1;
    delete[] in->sarg2;
    delete[] in->darg1;
    delete[] in->darg2;
    delete[] in->carg1;
    delete[] in->carg2;
    delete[] in->zarg1;
    delete[] in->zarg2;
    // Free host reference result arrays
    delete[] in->sref1;
    delete[] in->sref2;
    delete[] in->dref1;
    delete[] in->dref2;
    delete[] in->csref1;
    delete[] in->zdref1;
    delete[] in->cref1;
    delete[] in->zref1;
    // Free device argument arrays
    sycl::free(in->dev_sarg1, q);
    sycl::free(in->dev_sarg2, q);
    sycl::free(in->dev_darg1, q);
    sycl::free(in->dev_darg2, q);
    sycl::free(in->dev_carg1, q);
    sycl::free(in->dev_carg2, q);
    sycl::free(in->dev_zarg1, q);
    sycl::free(in->dev_zarg2, q);
    // Free host result arrays
    delete[] out->sres1;
    delete[] out->sres2;
    delete[] out->dres1;
    delete[] out->dres2;
    delete[] out->csres1;
    delete[] out->zdres1;
    delete[] out->cres1;
    delete[] out->zres1;
    // Free device result arrays; own_allocate_data obtains them with sycl::malloc_device,
    // so they are released here together with the rest
    sycl::free(out->dev_sres1, q);
    sycl::free(out->dev_sres2, q);
    sycl::free(out->dev_dres1, q);
    sycl::free(out->dev_dres2, q);
    sycl::free(out->dev_csres1, q);
    sycl::free(out->dev_zdres1, q);
    sycl::free(out->dev_cres1, q);
    sycl::free(out->dev_zres1, q);
} // own_deallocate_data

/**
 * @brief Allowed accuracy ulp
 *
 * Return allowed ulp for each accuracy level (specialization by precisions)
 *
 * @param[in] acc          Accuracy
 * @return                 Allowed ulp
 *
 */
// Always-false trait that depends on T, so the static_assert below fires only on instantiation
template <typename T> struct FalseType : std::false_type {};

// Primary template: any type without a specialization is rejected at compile time
template <typename T> double own_get_allowed_ulp(int acc) {
    static_assert(FalseType<T>::value, "No implementation of own_get_allowed_ulp for this type");
    return 0;
}
// float
template <> double own_get_allowed_ulp<float>(int acc) {
    return s_allowed_ulp[acc];
}
// double
template <> double own_get_allowed_ulp<double>(int acc) {
    return d_allowed_ulp[acc];
}
// complex float
template <> double own_get_allowed_ulp<std::complex<float>>(int acc) {
    return c_allowed_ulp[acc];
}
// complex double
template <> double own_get_allowed_ulp<std::complex<double>>(int acc) {
    return z_allowed_ulp[acc];
}

/**
 * @brief Fill inputs
 *
 * Fills input vectors by random numbers
 *
 * @param[in]  q        Sycl queue
 * @param[in]  beg      Begin of input ranges
 * @param[in]  end      End of input ranges
 * @param[out] in       Input arrays
 *
 */
static void own_fill_input(sycl::queue& q, double beg, double end, VmInputData* in) {
    const size_t count = static_cast<size_t>(length);

    // Generator with a fixed seed and a uniform distribution over [beg, end)
    std::mt19937 rng(777);
    std::uniform_real_distribution<double> pick(beg, end);

    for (size_t k = 0; k < count; ++k) {
        // Four draws per element: real parts of both arguments first, then both imaginary parts
        const double re1 = pick(rng);
        const double re2 = pick(rng);
        const double im1 = pick(rng);
        const double im2 = pick(rng);

        // Real arguments reuse the real parts; single precision values are the rounded doubles
        in->darg1[k] = re1;
        in->darg2[k] = re2;
        in->sarg1[k] = static_cast<float>(re1);
        in->sarg2[k] = static_cast<float>(re2);
        in->zarg1[k] = std::complex<double>(re1, im1);
        in->zarg2[k] = std::complex<double>(re2, im2);
        in->carg1[k] = std::complex<float>(static_cast<float>(re1), static_cast<float>(im1));
        in->carg2[k] = std::complex<float>(static_cast<float>(re2), static_cast<float>(im2));
    }

    // Upload the generated host values into the device argument arrays
    const size_t s_bytes = count * sizeof(float);
    const size_t d_bytes = count * sizeof(double);
    const size_t c_bytes = count * sizeof(std::complex<float>);
    const size_t z_bytes = count * sizeof(std::complex<double>);
    q.memcpy(in->dev_sarg1, in->sarg1, s_bytes);
    q.memcpy(in->dev_sarg2, in->sarg2, s_bytes);
    q.memcpy(in->dev_darg1, in->darg1, d_bytes);
    q.memcpy(in->dev_darg2, in->darg2, d_bytes);
    q.memcpy(in->dev_carg1, in->carg1, c_bytes);
    q.memcpy(in->dev_carg2, in->carg2, c_bytes);
    q.memcpy(in->dev_zarg1, in->zarg1, z_bytes);
    q.memcpy(in->dev_zarg2, in->zarg2, z_bytes);

    // Make sure every copy has completed before returning
    q.wait_and_throw();
} // own_fill_input

/**
 * @brief Fill outputs by constants
 *
 * Initialize output vectors by constants
 *
 * @param[in]  q        Sycl queue
 * @param[out] out      Output arrays
 *
 */
static void own_fill_output(sycl::queue& q, VmOutputData* out) {
    const size_t count = static_cast<size_t>(length);

    // Marker values written into every result slot before a function is run
    const float s_mark = static_cast<float>(7777.7777);
    const double d_mark = 7777.7777;
    const std::complex<float> c_mark(static_cast<float>(7777.7777), static_cast<float>(8888.888));
    const std::complex<double> z_mark(7777.7777, 8888.888);

    // Initialize host result arrays by the markers
    for (size_t k = 0; k < count; ++k) {
        out->sres1[k] = s_mark;
        out->sres2[k] = s_mark;
        out->dres1[k] = d_mark;
        out->dres2[k] = d_mark;
        out->csres1[k] = s_mark;
        out->zdres1[k] = d_mark;
        out->cres1[k] = c_mark;
        out->zres1[k] = z_mark;
    }

    // Copy the markers to the device result arrays
    const size_t s_bytes = count * sizeof(float);
    const size_t d_bytes = count * sizeof(double);
    q.memcpy(out->dev_sres1, out->sres1, s_bytes);
    q.memcpy(out->dev_sres2, out->sres2, s_bytes);
    q.memcpy(out->dev_dres1, out->dres1, d_bytes);
    q.memcpy(out->dev_dres2, out->dres2, d_bytes);
    q.memcpy(out->dev_csres1, out->csres1, s_bytes);
    q.memcpy(out->dev_zdres1, out->zdres1, d_bytes);
    q.memcpy(out->dev_cres1, out->cres1, count * sizeof(std::complex<float>));
    q.memcpy(out->dev_zres1, out->zres1, count * sizeof(std::complex<double>));

    // Reset the maximum observed ulp for every accuracy level
    for (int level = kHA; level < kAccNum; ++level) {
        out->sulp[level] = 0;
        out->dulp[level] = 0;
        out->culp[level] = 0;
        out->zulp[level] = 0;
    }

    // Reset error and warning counters of all four precisions
    out->serr = 0;
    out->swarn = 0;
    out->derr = 0;
    out->dwarn = 0;
    out->cerr = 0;
    out->cwarn = 0;
    out->zerr = 0;
    out->zwarn = 0;

    // Make sure every copy has completed before returning
    q.wait_and_throw();
} // own_fill_output

/**
 * @brief Transfer results to host
 *
 * Download results from device arrays to host
 *
 * @param[in]  q        Sycl queue
 * @param[out] out      Output arrays
 *
 */
static void own_transfer_to_host(sycl::queue& q, VmOutputData* out) {
    const size_t count = static_cast<size_t>(length);
    // Byte sizes of one full array for each element type
    const size_t s_bytes = count * sizeof(float);
    const size_t d_bytes = count * sizeof(double);
    const size_t c_bytes = count * sizeof(std::complex<float>);
    const size_t z_bytes = count * sizeof(std::complex<double>);

    // Real results
    q.memcpy(out->sres1, out->dev_sres1, s_bytes);
    q.memcpy(out->sres2, out->dev_sres2, s_bytes);
    q.memcpy(out->dres1, out->dev_dres1, d_bytes);
    q.memcpy(out->dres2, out->dev_dres2, d_bytes);
    // Real results of complex-argument functions
    q.memcpy(out->csres1, out->dev_csres1, s_bytes);
    q.memcpy(out->zdres1, out->dev_zdres1, d_bytes);
    // Complex results
    q.memcpy(out->cres1, out->dev_cres1, c_bytes);
    q.memcpy(out->zres1, out->dev_zres1, z_bytes);

    // Host arrays are valid only after all copies have completed
    q.wait_and_throw();
} // own_transfer_to_host

/**
 * @brief Fill reference vectors
 *
 * Compute reference results for different VM argument layouts (overloaded by layouts)
 *
 * @param[in]  layout      Function arguments layout
 * @param[in]  reffunc     Real reference function
 * @param[in]  creffunc    Complex reference function
 * @param[out] in          Input and reference results arrays
 *
 */
// V_V layout (majority of functions)
static void own_fill_reference(int layout, RefVtoV reffunc, CRefVtoV creffunc, VmInputData* in) {
    in->has_real = (reffunc != NULL);
    in->has_complex = (creffunc != NULL);

    for (int i = 0; i < length; i = i + 1) {
        if (in->has_real) {
            in->sref1[i] = ((RefVtoV)reffunc)(in->sarg1[i]);
            in->dref1[i] = ((RefVtoV)reffunc)(in->darg1[i]);
        }
        if (in->has_complex) {
            in->cref1[i] = ((CRefVtoV)creffunc)((std::complex<double>)(in->carg1[i]));
            in->zref1[i] = ((CRefVtoV)creffunc)(in->zarg1[i]);
        }
    }
} // own_fill_reference
// VV_V layout (2 arguments atan2, pow...)
static void own_fill_reference(int layout, RefVVtoV reffunc, CRefVVtoV creffunc, VmInputData* in) {
    in->has_real = (reffunc != NULL);
    in->has_complex = (creffunc != NULL);

    for (int i = 0; i < length; i = i + 1) {
        if (layout == kVXtoV) {
            float sfixed = (float)fixed;
            double dfixed = (double)fixed;
            if (in->has_real) {
                in->sref1[i] = ((RefVVtoV)reffunc)(in->sarg1[i], sfixed);
                in->dref1[i] = ((RefVVtoV)reffunc)(in->darg1[i], dfixed);
            }
            if (in->has_complex) {
                std::complex<double> cfixed = {sfixed, sfixed};
                std::complex<double> zfixed = {dfixed, dfixed};
                in->cref1[i] = ((CRefVVtoV)creffunc)((std::complex<double>)(in->carg1[i]), cfixed);
                in->zref1[i] = ((CRefVVtoV)creffunc)(in->zarg1[i], zfixed);
            }
        } else {
            if (in->has_real) {
                in->sref1[i] = ((RefVVtoV)reffunc)(in->sarg1[i], in->sarg2[i]);
                in->dref1[i] = ((RefVVtoV)reffunc)(in->darg1[i], in->darg2[i]);
            }
            if (in->has_complex) {
                in->cref1[i] = ((CRefVVtoV)creffunc)((std::complex<double>)(in->carg1[i]), (std::complex<double>)(in->carg2[i]));
                in->zref1[i] = ((CRefVVtoV)creffunc)(in->zarg1[i], in->zarg2[i]);
            }
        }
    }
} // own_fill_reference
// V_VV layout (2 results sincos, modf)
static void own_fill_reference(int layout, RefVtoVV reffunc, CRefVtoVV creffunc, VmInputData* in) {
    in->has_real = 1;
    in->has_complex = 0;

    for (int i = 0; i < length; i = i + 1) {
        ((RefVtoVV)reffunc)(in->sarg1[i], &(in->sref1[i]), &(in->sref2[i]));
        ((RefVtoVV)reffunc)(in->darg1[i], &(in->dref1[i]), &(in->dref2[i]));
    }
} // own_fill_reference
// VVX_V layout (linearfrac)
static void own_fill_reference(int layout, RefVVXtoV reffunc, CRefVtoV creffunc, VmInputData* in) {
    in->has_real = 1;
    in->has_complex = 0;

    for (int i = 0; i < length; i = i + 1) {
        float sfixed = (float)fixed;
        double dfixed = (double)fixed;
        in->sref1[i] = ((RefVVXtoV)reffunc)(in->sarg1[i], in->sarg2[i], sfixed, sfixed, sfixed, sfixed);
        in->dref1[i] = ((RefVVXtoV)reffunc)(in->darg1[i], in->darg2[i], dfixed, dfixed, dfixed, dfixed);
    }
} // own_fill_reference
// VC_VR layout (complex to real abs, arg)
static void own_fill_reference(int layout, RefVtoV reffunc, CRefCtoR creffunc, VmInputData* in) {
    in->has_real = (reffunc != NULL);
    in->has_complex = (creffunc != NULL);

    for (int i = 0; i < length; i = i + 1) {
        if (in->has_real) {
            in->sref1[i] = ((RefVtoV)reffunc)(in->sarg1[i]);
            in->dref1[i] = ((RefVtoV)reffunc)(in->darg1[i]);
        }
        if (in->has_complex) {
            /* Use complex array containers to accept real results */
            in->csref1[i] = ((CRefCtoR)creffunc)(in->carg1[i]);
            in->zdref1[i] = ((CRefCtoR)creffunc)(in->zarg1[i]);
        }
    }
} // own_fill_reference
// VR_VC layout (real to complex CIS)
static void own_fill_reference(int layout, RefVtoV reffunc, CRefRtoC creffunc, VmInputData* in) {
    in->has_real = 0;
    in->has_complex = 1;

    for (int i = 0; i < length; i = i + 1) {
        in->cref1[i] = ((CRefRtoC)creffunc)(in->sarg1[i]);
        in->zref1[i] = ((CRefRtoC)creffunc)(in->darg1[i]);
    }
} // own_fill_reference

/**
 * @brief Full VM function name for printout
 *
 * Construct full VM function name with precision, api and accuracy suffixes
 *
 * @param[out] buff        Pointer to output string buffer
 * @param[in] maxlen       Maximum string length
 * @param[in] fname        Base function name
 * @param[in] prec         Precision
 * @param[in] api          API variant
 * @param[in] acc          Accuracy
 * @return                 Pointer to constructed name
 *
 */
static char* own_full_name(char* buff, int maxlen, const char* fname, int prec, int acc) {
    // Precision letter goes in front of the base name, accuracy tag follows after an underscore
    const char* precision_tag[] = {"s", "d", "c", "z"};
    const char* accuracy_tag[] = {"HA", "LA", "EP"};

    const char* const lead = precision_tag[prec];
    const char* const tail = accuracy_tag[acc];
    snprintf(buff, maxlen, "%s%s_%s ", lead, fname, tail);

    return buff;
} // own_full_name

/**
 * @brief ULP calculation
 *
 * Computes ULP between result and reference value (overloaded by precisions)
 *
 * @param[in] res          Computed result
 * @param[in] ref          Reference result
 * @return                 Calculated ULP
 *
 */
// float
static double own_compute_ulp(float res, double ref) {
    if (!std::isfinite(ref)) {
        // Non-finite reference: the result must be of the same class (NaN or infinity).
        // For infinities the sign has to agree as well; the sign of a NaN is not compared.
        const bool same_class = (std::fpclassify(res) == std::fpclassify(ref));
        const bool same_sign = std::isnan(ref) || (std::signbit(res) == std::signbit(ref));
        return (same_class && same_sign) ? 0.0 : DBL_MAX;
    }

    int ex = 0;
    std::frexp(ref, &ex);                  // ex: integral power of two of ref
    double den = std::ldexp(1.0, ex - 24); // den: ulp's denominator 2^(ex-24)
    if (den == 0.0) {
        den = 0x1.p-149;                   // if den=0 then replace by EPS to avoid divbyzero
    }
    const double curulp = std::fabs((static_cast<double>(res) - ref) / den); // |res-ref|/2^(ex-24)
    return std::isfinite(curulp) ? curulp : DBL_MAX; // replace infinite ulp by big finite number
} // own_compute_ulp
// complex float
static double own_compute_ulp(std::complex<float> res, std::complex<double> ref) {
    // The complex ulp is the larger of the ulps of the real and imaginary components
    const double re_ulp = own_compute_ulp(res.real(), ref.real());
    const double im_ulp = own_compute_ulp(res.imag(), ref.imag());
    return std::fmax(re_ulp, im_ulp);
} // own_compute_ulp
// double
static double own_compute_ulp(double res, double ref) {
    if (!std::isfinite(ref)) {
        // Non-finite reference: the result must be of the same class (NaN or infinity).
        // For infinities the sign has to agree as well; the sign of a NaN is not compared.
        const bool same_class = (std::fpclassify(res) == std::fpclassify(ref));
        const bool same_sign = std::isnan(ref) || (std::signbit(res) == std::signbit(ref));
        return (same_class && same_sign) ? 0.0 : DBL_MAX;
    }

    int ex = 0;
    std::frexp(ref, &ex);                  // ex: integral power of two of ref
    double den = std::ldexp(1.0, ex - 53); // den: ulp's denominator 2^(ex-53)
    if (den == 0.0) {
        den = 0x1.p-1074;                  // if den=0 then replace by EPS to avoid divbyzero
    }
    const double curulp = std::fabs((res - ref) / den); // |res-ref|/2^(ex-53)
    return std::isfinite(curulp) ? curulp : DBL_MAX;    // replace infinite ulp by big finite number
} // own_compute_ulp
// complex double
static double own_compute_ulp(std::complex<double> res, std::complex<double> ref) {
    // The complex ulp is the larger of the ulps of the real and imaginary components
    const double re_ulp = own_compute_ulp(res.real(), ref.real());
    const double im_ulp = own_compute_ulp(res.imag(), ref.imag());
    return std::fmax(re_ulp, im_ulp);
} // own_compute_ulp

/**
 * @brief Printout ULP value
 *
 * Print arguments, results and ULP difference (with specialization by precisions)
 *
 * @param[in]  fname       Function name
 * @param[in]  layout      Function arguments layout
 * @param[in]  acc         Function accuracy
 * @param[in]  idx         Vector index
 * @param[in]  ulp         ULP result
 * @param[in]  in          Input and reference result arrays
 * @param[out] out         Output arrays
 *
 */
// Primary template: only the explicit specializations below are usable; any other type is rejected at compile time
template <typename T>
void own_print_ulp(const char* fname, int layout, int acc, int idx, double ulp, VmInputData* in, VmOutputData* out) {
    static_assert(FalseType<T>::value, "No implementation of own_print_ulp for this type");
}
// float
template <>
void own_print_ulp<float>(const char* fname, int layout, int acc, int idx, double ulp, VmInputData* in, VmOutputData* out) {
    char name_buf[name_len] = {0};
    const bool two_args = (layout == kVVtoV) || (layout == kVVXtoV);
    const bool two_results = (layout == kVtoVV);

    // Function name and failing index
    fprintf(stderr, "\t\tULP_OVER_BOUND: %s[%d](", own_full_name(name_buf, name_len, fname, kSP, acc), idx);

    // Argument(s), each printed in decimal and in hexadecimal floating-point form
    fprintf(stderr, "%.2g {%a}", in->sarg1[idx], in->sarg1[idx]);
    if (two_args) {
        fprintf(stderr, ", %.2g {%a}", in->sarg2[idx], in->sarg2[idx]);
    }

    // Computed result(s)
    fprintf(stderr, ") = %.2g {%a}", out->sres1[idx], out->sres1[idx]);
    if (two_results) {
        fprintf(stderr, ", %.2g {%a}", out->sres2[idx], out->sres2[idx]);
    }

    // Reference result(s)
    fprintf(stderr, ", expected = %.3lg {%la}", in->sref1[idx], in->sref1[idx]);
    if (two_results) {
        fprintf(stderr, ", %.3lg {%la}", in->sref2[idx], in->sref2[idx]);
    }

    fprintf(stderr, ", ulp = %.3lg\n", ulp);
    fflush(stderr);
} // own_print_ulp
// double
template <>
void own_print_ulp<double>(const char* fname, int layout, int acc, int idx, double ulp, VmInputData* in, VmOutputData* out) {
    char name_buf[name_len] = {0};
    const bool two_args = (layout == kVVtoV) || (layout == kVVXtoV);
    const bool two_results = (layout == kVtoVV);

    // Function name and failing index
    fprintf(stderr, "\t\tULP_OVER_BOUND: %s[%d](", own_full_name(name_buf, name_len, fname, kDP, acc), idx);

    // Argument(s), each printed in decimal and in hexadecimal floating-point form
    fprintf(stderr, "%.3lg {%la}", in->darg1[idx], in->darg1[idx]);
    if (two_args) {
        fprintf(stderr, ", %.3lg {%la}", in->darg2[idx], in->darg2[idx]);
    }

    // Computed result(s)
    fprintf(stderr, ") = %.3lg {%la}", out->dres1[idx], out->dres1[idx]);
    if (two_results) {
        fprintf(stderr, ", %.3lg {%la}", out->dres2[idx], out->dres2[idx]);
    }

    // Reference result(s)
    fprintf(stderr, ", expected = %.3lg {%la}", in->dref1[idx], in->dref1[idx]);
    if (two_results) {
        fprintf(stderr, ", %.3lg {%la}", in->dref2[idx], in->dref2[idx]);
    }

    fprintf(stderr, ", ulp = %.3lg\n", ulp);
    fflush(stderr);
} // own_print_ulp
// complex float
template <>
void own_print_ulp<std::complex<float>>(const char* fname, int layout, int acc, int idx, double ulp, VmInputData* in,
                                        VmOutputData* out) {
    char name_buf[name_len] = {0};

    // Function name and failing index
    fprintf(stderr, "\t\tULP_OVER_BOUND: %s[%d](", own_full_name(name_buf, name_len, fname, kCP, acc), idx);

    // First argument: real for the real-to-complex layout, complex otherwise
    if (layout == kVRtoVC) {
        fprintf(stderr, "%.2g {%a}", in->sarg1[idx], in->sarg1[idx]);
    } else {
        const std::complex<float>& a1 = in->carg1[idx];
        fprintf(stderr, "%.2g+i*%.2g {%a+i*%a}", a1.real(), a1.imag(), a1.real(), a1.imag());
    }

    // Second complex argument for two-argument functions
    if (layout == kVVtoV) {
        const std::complex<float>& a2 = in->carg2[idx];
        fprintf(stderr, ", %.2g+i*%.2g {%a+i*%a}", a2.real(), a2.imag(), a2.real(), a2.imag());
    }

    // Computed and reference results: real for the complex-to-real layout, complex otherwise
    if (layout == kVCtoVR) {
        fprintf(stderr, ") = %.2g {%a}", out->csres1[idx], out->csres1[idx]);
        fprintf(stderr, ", expected = %.3lg {%la}", in->csref1[idx], in->csref1[idx]);
    } else {
        const std::complex<float>& got = out->cres1[idx];
        const std::complex<double>& want = in->cref1[idx];
        fprintf(stderr, ") = %.2g+i*%.2g {%a+i*%a}", got.real(), got.imag(), got.real(), got.imag());
        fprintf(stderr, ", expected = %.3lg+i*%.3lg {%la+i*%la}", want.real(), want.imag(), want.real(), want.imag());
    }

    fprintf(stderr, ", ulp = %.3lg\n", ulp);
    fflush(stderr);
} // own_print_ulp
// complex double
template <>
void own_print_ulp<std::complex<double>>(const char* fname, int layout, int acc, int idx, double ulp, VmInputData* in,
                                         VmOutputData* out) {
    char name_buf[name_len] = {0};

    // Function name and failing index
    fprintf(stderr, "\t\tULP_OVER_BOUND: %s[%d](", own_full_name(name_buf, name_len, fname, kZP, acc), idx);

    // First argument: real for the real-to-complex layout, complex otherwise
    if (layout == kVRtoVC) {
        fprintf(stderr, "%.3lg {%la}", in->darg1[idx], in->darg1[idx]);
    } else {
        const std::complex<double>& a1 = in->zarg1[idx];
        fprintf(stderr, "%.3lg+i*%.3lg {%la+i*%la}", a1.real(), a1.imag(), a1.real(), a1.imag());
    }

    // Second complex argument for two-argument functions
    if (layout == kVVtoV) {
        const std::complex<double>& a2 = in->zarg2[idx];
        fprintf(stderr, ", %.3lg+i*%.3lg {%la+i*%la}", a2.real(), a2.imag(), a2.real(), a2.imag());
    }

    // Computed and reference results: real for the complex-to-real layout, complex otherwise
    if (layout == kVCtoVR) {
        fprintf(stderr, ") = %.3lg {%la}", out->zdres1[idx], out->zdres1[idx]);
        fprintf(stderr, ", expected = %.3lg {%la}", in->zdref1[idx], in->zdref1[idx]);
    } else {
        const std::complex<double>& got = out->zres1[idx];
        const std::complex<double>& want = in->zref1[idx];
        fprintf(stderr, ") = %.3lg+i*%.3lg {%la+i*%la}", got.real(), got.imag(), got.real(), got.imag());
        fprintf(stderr, ", expected = %.3lg+i*%.3lg {%la+i*%la}", want.real(), want.imag(), want.real(), want.imag());
    }

    fprintf(stderr, ", ulp = %.3lg\n", ulp);
    fflush(stderr);
} // own_print_ulp

/**
 * @brief Evaluation of one result
 *
 * Measure accuracy of one result (with specialization by precisions)
 *
 * @param[in] fname        Function name
 * @param[in] idx          Vector index
 * @param[in] layout       Function arguments layout
 * @param[in] acc          Function accuracy
 * @param[in] in           Input and reference result arrays
 * @param[out] out         Output arrays
 * @param[out] resulp      Resulted ulp
 * @return                 Pass or failure
 *
 */
// Primary template: only the explicit specializations are usable; any other type is rejected at compile time
template <typename T>
int own_evaluate_res(const char* fname, int idx, int layout, int acc, VmInputData* in, VmOutputData* out, double* resulp) {
    static_assert(FalseType<T>::value, "No implementation of own_evaluate_res for this type");
    return 0;
}
// float: single precision real results
template <>
int own_evaluate_res<float>(const char* fname, int idx, int layout, int acc, VmInputData* in, VmOutputData* out, double* resulp) {
    // Error of the first result; two-output layouts (kVtoVV) take the worse of both results
    double cur_ulp = own_compute_ulp(out->sres1[idx], in->sref1[idx]);
    if (layout == kVtoVV) {
        cur_ulp = std::fmax(cur_ulp, own_compute_ulp(out->sres2[idx], in->sref2[idx]));
    }

    // Keep the running maximum for this accuracy level
    out->sulp[acc] = std::fmax(out->sulp[acc], cur_ulp);

    // Above the bound of the requested accuracy?
    const bool over_bound = cur_ulp > own_get_allowed_ulp<float>(acc);
    // VVX_V LinearFrac function has only EP implementation and allows higher ulp for HA/LA:
    // such results are downgraded to a warning while they stay below the EP bound
    const bool downgraded = over_bound && (layout == kVVXtoV) && (cur_ulp < own_get_allowed_ulp<float>(kEP));

    const int is_warn = downgraded ? 1 : 0;
    const int is_err = (over_bound && !downgraded) ? 1 : 0;

    out->swarn += is_warn;
    out->serr += is_err;
    *resulp = cur_ulp;

    // Non-zero only when the matching print switch is enabled
    return (print_err && is_err) || (print_warn && is_warn);
} // own_evaluate_res
// double: double precision real results
template <>
int own_evaluate_res<double>(const char* fname, int idx, int layout, int acc, VmInputData* in, VmOutputData* out, double* resulp) {
    // Error of the first result; two-output layouts (kVtoVV) take the worse of both results
    double cur_ulp = own_compute_ulp(out->dres1[idx], in->dref1[idx]);
    if (layout == kVtoVV) {
        cur_ulp = std::fmax(cur_ulp, own_compute_ulp(out->dres2[idx], in->dref2[idx]));
    }

    // Keep the running maximum for this accuracy level
    out->dulp[acc] = std::fmax(out->dulp[acc], cur_ulp);

    // Above the bound of the requested accuracy?
    const bool over_bound = cur_ulp > own_get_allowed_ulp<double>(acc);
    // VVX_V LinearFrac function has only EP implementation and allows higher ulp for HA/LA:
    // such results are downgraded to a warning while they stay below the EP bound
    const bool downgraded = over_bound && (layout == kVVXtoV) && (cur_ulp < own_get_allowed_ulp<double>(kEP));

    const int is_warn = downgraded ? 1 : 0;
    const int is_err = (over_bound && !downgraded) ? 1 : 0;

    out->dwarn += is_warn;
    out->derr += is_err;
    *resulp = cur_ulp;

    // Non-zero only when the matching print switch is enabled
    return (print_err && is_err) || (print_warn && is_warn);
} // own_evaluate_res
// complex float: single precision complex results
template <>
int own_evaluate_res<std::complex<float>>(const char* fname, int idx, int layout, int acc, VmInputData* in, VmOutputData* out,
                                          double* resulp) {
    double cur_ulp = 0.0;
    if (layout != kVCtoVR) {
        // Regular case: complex result against complex reference
        cur_ulp = own_compute_ulp(out->cres1[idx], in->cref1[idx]);
    } else {
        // VC_VR complex-to-real functions: the real-valued result and
        // reference are read through the csres1 / csref1 fields
        cur_ulp = own_compute_ulp(out->csres1[idx], in->csref1[idx]);
    }

    // Keep the running maximum for this accuracy level
    out->culp[acc] = std::fmax(out->culp[acc], cur_ulp);

    // Above the bound of the requested accuracy?
    const bool over_bound = cur_ulp > own_get_allowed_ulp<std::complex<float>>(acc);
    // HA/LA complex VM functions may exceed their bound because the standard
    // C++ reference functions are less precise; such cases are reported as
    // warnings while they stay below the EP bound
    const bool downgraded = over_bound && (cur_ulp < own_get_allowed_ulp<std::complex<float>>(kEP));

    const int is_warn = downgraded ? 1 : 0;
    const int is_err = (over_bound && !downgraded) ? 1 : 0;

    out->cwarn += is_warn;
    out->cerr += is_err;
    *resulp = cur_ulp;

    // Non-zero only when the matching print switch is enabled
    return (print_err && is_err) || (print_warn && is_warn);
} // own_evaluate_res
// complex double: double precision complex results
template <>
int own_evaluate_res<std::complex<double>>(const char* fname, int idx, int layout, int acc, VmInputData* in, VmOutputData* out,
                                           double* resulp) {
    double cur_ulp = 0.0;
    if (layout != kVCtoVR) {
        // Regular case: complex result against complex reference
        cur_ulp = own_compute_ulp(out->zres1[idx], in->zref1[idx]);
    } else {
        // VC_VR complex-to-real functions: the real-valued result and
        // reference are read through the zdres1 / zdref1 fields
        cur_ulp = own_compute_ulp(out->zdres1[idx], in->zdref1[idx]);
    }

    // Keep the running maximum for this accuracy level
    out->zulp[acc] = std::fmax(out->zulp[acc], cur_ulp);

    // Above the bound of the requested accuracy?
    const bool over_bound = cur_ulp > own_get_allowed_ulp<std::complex<double>>(acc);
    // HA/LA complex VM functions may exceed their bound because the standard
    // C++ reference functions are less precise; such cases are reported as
    // warnings while they stay below the EP bound
    const bool downgraded = over_bound && (cur_ulp < own_get_allowed_ulp<std::complex<double>>(kEP));

    const int is_warn = downgraded ? 1 : 0;
    const int is_err = (over_bound && !downgraded) ? 1 : 0;

    out->zwarn += is_warn;
    out->zerr += is_err;
    *resulp = cur_ulp;

    // Non-zero only when the matching print switch is enabled
    return (print_err && is_err) || (print_warn && is_warn);
} // own_evaluate_res

/**
 * @brief Evaluation of VM function results
 *
 * Measure accuracy of VM function family results
 * in comparison to reference scalar implementations.
 *
 * @param[in]  fname       Function name
 * @param[in]  layout      Function arguments layout
 * @param[in]  acc         Function accuracy
 * @param[in]  in          Input and reference resutl arrays
 * @param[out] out         Output arrays
 * @return                 Total number of errors
 *
 */
template <typename T> void own_evaluate_func(const char* fname, int layout, int acc, VmInputData* in, VmOutputData* out) {
    int printed = 0;
    double ulp = 0.0;
    for (int i = 0; i < length; i = i + 1) {
        if ((own_evaluate_res<T>(fname, i, layout, acc, in, out, &ulp)) && (printed < max_printed)) {
            own_print_ulp<T>(fname, layout, acc, i, ulp, in, out);
            printed++;
        }
    }
    return;
} // own_evaluate_func

/**
 * @brief Evaluation of one VM functions family
 *
 * Fills inputs, reference results and outputs, runs the VM launcher for each
 * accuracy (HA/LA/EP), checks the results of every enabled precision and
 * prints one summary line per precision.
 *
 * @param[in] q            Sycl queue
 * @param[in] fname        Function name
 * @param[in] beg          Begin of input ranges
 * @param[in] end          End of input ranges
 * @param[in] vmfunc       Pointer to VM functions launcher
 * @param[in] reffunc      Real reference function
 * @param[in] creffunc     Complex reference function
 * @param[in] layout       Function arguments layout
 * @param[in] in           Input and reference result arrays
 * @param[out] out         Output arrays
 * @return                 Sum of the serr, derr, cerr and zerr counters of out
 *
 */
template <typename R, typename C>
int own_evaluate(sycl::queue& q, const char* fname, double beg, double end, VmFunc vmfunc, R reffunc, C creffunc, int layout,
                 VmInputData* in, VmOutputData* out) {
    // Status label of a summary line: errors take precedence over warnings
    const auto status_of = [](auto errors, auto warnings) -> const char* {
        if (errors) {
            return "OVER";
        }
        if (warnings) {
            return "WARN";
        }
        return "NORM";
    };

    // Preparation: random inputs, reference results, constant-filled outputs
    own_fill_input(q, beg, end, in);
    own_fill_reference(layout, reffunc, creffunc, in);
    own_fill_output(q, out);

    // One pass per available accuracy (HA/LA/EP)
    for (int acc = kHA; acc < kAccNum; ++acc) {
        vmfunc(q, acc, in, out);      // call all variants of the VM function
        own_transfer_to_host(q, out); // bring device results to the host

        if (in->has_real) {
            own_evaluate_func<float>(fname, layout, acc, in, out);
            own_evaluate_func<double>(fname, layout, acc, in, out);
        }
        if (in->has_complex) {
            own_evaluate_func<std::complex<float>>(fname, layout, acc, in, out);
            own_evaluate_func<std::complex<double>>(fname, layout, acc, in, out);
        }
    }

    // Summary lines for the real precisions
    if (in->has_real) {
        fprintf(stdout, "\ts%-11s, ha:,%7.2lg, la:, %7.2lg, ep:, %7.2lg, %s\n", fname, out->sulp[kHA], out->sulp[kLA],
                out->sulp[kEP], status_of(out->serr, out->swarn));
        fprintf(stdout, "\td%-11s, ha:,%7.2lg, la:, %7.2lg, ep:, %7.2lg, %s\n", fname, out->dulp[kHA], out->dulp[kLA],
                out->dulp[kEP], status_of(out->derr, out->dwarn));
    }
    // Summary lines for the complex precisions
    if (in->has_complex) {
        fprintf(stdout, "\tc%-11s, ha:,%7.2lg, la:, %7.2lg, ep:, %7.2lg, %s\n", fname, out->culp[kHA], out->culp[kLA],
                out->culp[kEP], status_of(out->cerr, out->cwarn));
        fprintf(stdout, "\tz%-11s, ha:,%7.2lg, la:, %7.2lg, ep:, %7.2lg, %s\n", fname, out->zulp[kHA], out->zulp[kLA],
                out->zulp[kEP], status_of(out->zerr, out->zwarn));
    }
    fflush(stdout);

    // Total number of errors over all four precisions
    return (out->serr + out->derr + out->cerr + out->zerr);
} // own_evaluate

/**
 * @brief Run VM test on device
 *
 * Performs accuracy testing of all VM functions on selected device
 *
 * @param[in] dev    Sycl device
 *
 * @return           Number of errors
 *
 */
int own_run_on(sycl::device& dev) {
    int err = 0; // Total errors

    sycl::queue q{dev, own_async_sycl_error}; // Create sycl queue

    own_preamble(dev); // Print sycl device info

    VmInputData in;   // Input data
    VmOutputData out; // Output data

    own_allocate_data(q, &in, &out); // Allocate input and output data memory

    fprintf(stdout, "\t===========================================================\n");
    err += own_evaluate<RefVtoV, CRefVtoV>(q, "Asin", -0.9, 0.9, own_vm_asin, asin, NULL, kVtoV, &in, &out);
    err += own_evaluate<RefVtoV, CRefVtoV>(q, "Acos", -0.9, 0.9, own_vm_acos, acos, NULL, kVtoV, &in, &out);
    err += own_evaluate<RefVtoV, CRefVtoV>(q, "Atan", -10000, 10000, own_vm_atan, atan, NULL, kVtoV, &in, &out);
    err += own_evaluate<RefVVtoV, CRefVVtoV>(q, "Atan2", -10000, 10000, own_vm_atan2, atan2, NULL, kVVtoV, &in, &out);
    err += own_evaluate<RefVtoV, CRefVtoV>(q, "Asinh", -10000, 10000, own_vm_asinh, asinh, NULL, kVtoV, &in, &out);
    err += own_evaluate<RefVtoV, CRefVtoV>(q, "Acosh", 1.01, 1000, own_vm_acosh, acosh, NULL, kVtoV, &in, &out);
    err += own_evaluate<RefVtoV, CRefVtoV>(q, "Atanh", -0.9, 0.9, own_vm_atanh, atanh, NULL, kVtoV, &in, &out);
    err += own_evaluate<RefVtoV, CRefVtoV>(q, "Sin", -10, 10, own_vm_sin, sin, NULL, kVtoV, &in, &out);
    err += own_evaluate<RefVtoV, CRefVtoV>(q, "Cos", -10, 10, own_vm_cos, cos, NULL, kVtoV, &in, &out);
    err += own_evaluate<RefVtoV, CRefVtoV>(q, "Tan", -10, 10, own_vm_tan, tan, NULL, kVtoV, &in, &out);
    err += own_evaluate<RefVtoVV, CRefVtoVV>(q, "SinCos", -10000, 10000, own_vm_sincos, own_sincos, NULL, kVtoVV, &in, &out);
    err += own_evaluate<RefVtoV, CRefVtoV>(q, "Sinh", -50, 50, own_vm_sinh, sinh, NULL, kVtoV, &in, &out);
    err += own_evaluate<RefVtoV, CRefVtoV>(q, "Cosh", -50, 50, own_vm_cosh, cosh, NULL, kVtoV, &in, &out);
    err += own_evaluate<RefVtoV, CRefVtoV>(q, "Tanh", -5, 5, own_vm_tanh, tanh, NULL, kVtoV, &in, &out);
    err += own_evaluate<RefVtoV, CRefVtoV>(q, "Exp", -75, 75, own_vm_exp, exp, own_cexp, kVtoV, &in, &out);
    err += own_evaluate<RefVtoV, CRefVtoV>(q, "Exp2", -30, 30, own_vm_exp2, exp2, NULL, kVtoV, &in, &out);
    err += own_evaluate<RefVtoV, CRefVtoV>(q, "Expm1", -30, 30, own_vm_expm1, expm1, NULL, kVtoV, &in, &out);
    err += own_evaluate<RefVtoV, CRefVtoV>(q, "Ln", 1.01, 100000, own_vm_ln, log, own_clog, kVtoV, &in, &out);
    err += own_evaluate<RefVtoV, CRefVtoV>(q, "Log2", 1.01, 100000, own_vm_log2, log2, NULL, kVtoV, &in, &out);
    err += own_evaluate<RefVtoV, CRefVtoV>(q, "Log1p", 0.01, 100000, own_vm_log1p, log1p, NULL, kVtoV, &in, &out);
    err += own_evaluate<RefVVtoV, CRefVVtoV>(q, "Pow", 0.1, 10, own_vm_pow, pow, NULL, kVVtoV, &in, &out);
    err += own_evaluate<RefVVtoV, CRefVVtoV>(q, "Powr", 0.1, 10, own_vm_powr, powr, NULL, kVVtoV, &in, &out);
    err += own_evaluate<RefVVtoV, CRefVVtoV>(q, "Powx", 0.1, 10, own_vm_powx, pow, NULL, kVXtoV, &in, &out);
    err += own_evaluate<RefVtoV, CRefVtoV>(q, "Pow2o3", 0.1, 10, own_vm_pow2o3, own_pow2o3, NULL, kVtoV, &in, &out);
    err += own_evaluate<RefVtoV, CRefVtoV>(q, "Pow3o2", 0.1, 10, own_vm_pow3o2, own_pow3o2, NULL, kVtoV, &in, &out);
    err += own_evaluate<RefVtoV, CRefVtoV>(q, "Sqrt", 0.1, 100, own_vm_sqrt, sqrt, own_csqrt, kVtoV, &in, &out);
    err += own_evaluate<RefVtoV, CRefVtoV>(q, "Cbrt", 0.1, 10000, own_vm_cbrt, cbrt, NULL, kVtoV, &in, &out);
    err += own_evaluate<RefVtoV, CRefVtoV>(q, "InvSqrt", 0.1, 10000, own_vm_invsqrt, own_invsqrt, NULL, kVtoV, &in, &out);
    err += own_evaluate<RefVtoV, CRefVtoV>(q, "InvCbrt", 0.1, 10000, own_vm_invcbrt, own_invcbrt, NULL, kVtoV, &in, &out);
    err += own_evaluate<RefVVtoV, CRefVVtoV>(q, "Hypot", -10000, 10000, own_vm_hypot, hypot, NULL, kVVtoV, &in, &out);
    err += own_evaluate<RefVtoV, CRefVtoV>(q, "Erf", -5, 5, own_vm_erf, erf, NULL, kVtoV, &in, &out);
    err += own_evaluate<RefVtoV, CRefVtoV>(q, "Erfc", -2, 5, own_vm_erfc, erfc, NULL, kVtoV, &in, &out);
    err += own_evaluate<RefVtoV, CRefVtoV>(q, "Ceil", -10000, 10000, own_vm_ceil, ceil, NULL, kVtoV, &in, &out);
    err += own_evaluate<RefVtoV, CRefVtoV>(q, "Floor", -10000, 10000, own_vm_floor, floor, NULL, kVtoV, &in, &out);
    err += own_evaluate<RefVtoV, CRefVtoV>(q, "Round", -10000, 10000, own_vm_round, round, NULL, kVtoV, &in, &out);
    err += own_evaluate<RefVtoV, CRefVtoV>(q, "Trunc", -10000, 10000, own_vm_trunc, trunc, NULL, kVtoV, &in, &out);
    err += own_evaluate<RefVtoV, CRefVtoV>(q, "Rint", -10000, 10000, own_vm_rint, rint, NULL, kVtoV, &in, &out);
    err += own_evaluate<RefVtoV, CRefVtoV>(q, "NearbyInt", -10000, 10000, own_vm_nearbyint, nearbyint, NULL, kVtoV, &in, &out);
    err += own_evaluate<RefVVtoV, CRefVVtoV>(q, "Remainder", -10000, 10000, own_vm_remainder, remainder, NULL, kVVtoV, &in, &out);
    err += own_evaluate<RefVVtoV, CRefVVtoV>(q, "Add", -10000, 10000, own_vm_add, own_add, own_cadd, kVVtoV, &in, &out);
    err += own_evaluate<RefVVtoV, CRefVVtoV>(q, "Sub", -10000, 10000, own_vm_sub, own_sub, own_csub, kVVtoV, &in, &out);
    err += own_evaluate<RefVVtoV, CRefVVtoV>(q, "Mul", -10000, 10000, own_vm_mul, own_mul, own_cmul, kVVtoV, &in, &out);
    err += own_evaluate<RefVVtoV, CRefVVtoV>(q, "Div", -10000, 10000, own_vm_div, own_div, own_cdiv, kVVtoV, &in, &out);
    err += own_evaluate<RefVtoV, CRefVtoV>(q, "Sqr", -10000, 10000, own_vm_sqr, own_sqr, NULL, kVtoV, &in, &out);
    err += own_evaluate<RefVtoV, CRefVtoV>(q, "Inv", -10000, 10000, own_vm_inv, own_inv, NULL, kVtoV, &in, &out);
    err += own_evaluate<RefVtoVV, CRefVtoVV>(q, "Modf", -10000, 10000, own_vm_modf, own_modf, NULL, kVtoVV, &in, &out);
    err += own_evaluate<RefVVtoV, CRefVVtoV>(q, "Fmod", -10000, 10000, own_vm_fmod, fmod, NULL, kVVtoV, &in, &out);
    err += own_evaluate<RefVVtoV, CRefVVtoV>(q, "Fdim", -10000, 10000, own_vm_fdim, fdim, NULL, kVVtoV, &in, &out);
    err += own_evaluate<RefVVtoV, CRefVVtoV>(q, "Fmax", -10000, 10000, own_vm_fmax, fmax, NULL, kVVtoV, &in, &out);
    err += own_evaluate<RefVVtoV, CRefVVtoV>(q, "Fmin", -10000, 10000, own_vm_fmin, fmin, NULL, kVVtoV, &in, &out);
    err += own_evaluate<RefVVtoV, CRefVVtoV>(q, "MaxMag", -10000, 10000, own_vm_maxmag, own_maxmag, NULL, kVVtoV, &in, &out);
    err += own_evaluate<RefVVtoV, CRefVVtoV>(q, "MinMag", -10000, 10000, own_vm_minmag, own_minmag, NULL, kVVtoV, &in, &out);
    err += own_evaluate<RefVVtoV, CRefVVtoV>(q, "NextAfter", -10000, 10000, own_vm_nextafter, nextafter, NULL, kVVtoV, &in, &out);
    err += own_evaluate<RefVVtoV, CRefVVtoV>(q, "CopySign", -10000, 10000, own_vm_copysign, copysign, NULL, kVVtoV, &in, &out);
    err += own_evaluate<RefVtoV, CRefVtoV>(q, "Frac", -10000, 10000, own_vm_frac, own_frac, NULL, kVtoV, &in, &out);
    err += own_evaluate<RefVtoV, CRefVtoV>(q, "Conj", -10000, 10000, own_vm_conj, NULL, own_conj, kVtoV, &in, &out);
    err +=
        own_evaluate<RefVVtoV, CRefVVtoV>(q, "MulByConj", -10000, 10000, own_vm_mulbyconj, NULL, own_cmulbyconj, kVVtoV, &in, &out);
    err += own_evaluate<RefVtoV, CRefRtoC>(q, "CIS", -10000, 10000, own_vm_cis, NULL, own_cis, kVRtoVC, &in, &out);
    err += own_evaluate<RefVtoV, CRefCtoR>(q, "Arg", -10000, 10000, own_vm_arg, NULL, own_carg, kVCtoVR, &in, &out);
    err += own_evaluate<RefVtoV, CRefCtoR>(q, "Abs", -10000, 10000, own_vm_abs, fabs, own_cabs, kVCtoVR, &in, &out);
    /* Functions with Intel-specific reference LIBM implementations */
#if (defined __INTEL_LLVM_COMPILER)
    err += own_evaluate<RefVtoV, CRefVtoV>(q, "Asinpi", -0.9, 0.9, own_vm_asinpi, asinpi, NULL, kVtoV, &in, &out);
    err += own_evaluate<RefVtoV, CRefVtoV>(q, "Acospi", -0.9, 0.9, own_vm_acospi, acospi, NULL, kVtoV, &in, &out);
    err += own_evaluate<RefVtoV, CRefVtoV>(q, "Atanpi", -10000, 10000, own_vm_atanpi, atanpi, NULL, kVtoV, &in, &out);
    err += own_evaluate<RefVVtoV, CRefVVtoV>(q, "Atan2pi", -10000, 10000, own_vm_atan2pi, atan2pi, NULL, kVVtoV, &in, &out);
    err += own_evaluate<RefVtoV, CRefVtoV>(q, "Sind", -10000, 10000, own_vm_sind, sind, NULL, kVtoV, &in, &out);
    err += own_evaluate<RefVtoV, CRefVtoV>(q, "Cosd", -10000, 10000, own_vm_cosd, cosd, NULL, kVtoV, &in, &out);
    err += own_evaluate<RefVtoV, CRefVtoV>(q, "Tand", -10000, 10000, own_vm_tand, tand, NULL, kVtoV, &in, &out);
    err += own_evaluate<RefVtoV, CRefVtoV>(q, "Sinpi", -10000, 10000, own_vm_sinpi, sinpi, NULL, kVtoV, &in, &out);
    err += own_evaluate<RefVtoV, CRefVtoV>(q, "Cospi", -10000, 10000, own_vm_cospi, cospi, NULL, kVtoV, &in, &out);
    err += own_evaluate<RefVtoV, CRefVtoV>(q, "Tanpi", -10000, 10000, own_vm_tanpi, tanpi, NULL, kVtoV, &in, &out);
    err += own_evaluate<RefVtoV, CRefVtoV>(q, "Exp10", -30, 30, own_vm_exp10, exp10, NULL, kVtoV, &in, &out);
    err += own_evaluate<RefVtoV, CRefVtoV>(q, "Log10", 1.01, 100, own_vm_log10, log10, NULL, kVtoV, &in, &out);
    err += own_evaluate<RefVtoV, CRefVtoV>(q, "ErfInv", -0.9, 0.9, own_vm_erfinv, erfinv, NULL, kVtoV, &in, &out);
    err += own_evaluate<RefVtoV, CRefVtoV>(q, "ErfcInv", -0.1, 1.9, own_vm_erfcinv, erfcinv, NULL, kVtoV, &in, &out);
    err += own_evaluate<RefVtoV, CRefVtoV>(q, "CdfNorm", -4, 4, own_vm_cdfnorm, cdfnorm, NULL, kVtoV, &in, &out);
    err += own_evaluate<RefVtoV, CRefVtoV>(q, "CdfNormInv", -0.1, 0.9, own_vm_cdfnorminv, cdfnorminv, NULL, kVtoV, &in, &out);
#endif
    fprintf(stdout, "\t===========================================================\n");

    own_deallocate_data(q, &in, &out); // Free allocated memory

    return (err > 0) ? -1 : 0;
} // own_run_on

//
// Main entry point for example.
//
// Dispatches to appropriate device types as set at build time with flag:
// -DSYCL_DEVICES_cpu -- only runs SYCL CPU device
// -DSYCL_DEVICES_gpu -- only runs SYCL GPU device
// -DSYCL_DEVICES_all (default) -- runs on all: CPU and GPU devices
//
//  For each device selected and each data type supported, the example
//  runs with all supported data types
//
int main(int argc, char** argv) {
    int ret = 0; // return status
    fprintf(stdout, "sycl vm_device_all_funcs: started...\n");
    fflush(stdout);

    // List of available devices
    std::list<my_sycl_device_types> list_of_devices;
    set_list_of_devices(list_of_devices);

    // Loop by all available devices
    for (auto dev_type : list_of_devices) {
        sycl::device my_dev;
        bool my_dev_is_found = false;
        get_sycl_device(my_dev, my_dev_is_found, dev_type);

        // Run tests if the device is available
        if (my_dev_is_found) {
            fprintf(stdout, "Running tests on %s.\n", sycl_device_names[dev_type].c_str());
            fflush(stdout);
            try {
                ret |= own_run_on(my_dev);
            } catch (sycl::exception const& e) {
                fprintf(stderr, "sycl::exception caught. %s\n", e.what());
                ret = 1;
            } catch (std::exception const& e) {
                fprintf(stderr, "std::exception caught. %s\n", e.what());
                ret = 1;
            }
        } else {
            fprintf(stderr, "No %s devices found; skipping %s tests.\n", sycl_device_names[dev_type].c_str(),
                    sycl_device_names[dev_type].c_str());
        }
    }

    fflush(stdout);
    fprintf(stdout, "Sycl vm_device_all_funcs: accuracy: %s\n\n", ret != 0 ? "over bounds" : "normal");

    return ret;
} // main
