Program Listing for File rocrand_mtgp32.h

Return to documentation for file (library/include/rocrand/rocrand_mtgp32.h)

// Copyright (c) 2017-2022 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.

/*
 * Copyright (c) 2009, 2010 Mutsuo Saito, Makoto Matsumoto and Hiroshima
 * University.  All rights reserved.
 * Copyright (c) 2011 Mutsuo Saito, Makoto Matsumoto, Hiroshima
 * University and University of Tokyo.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above
 *       copyright notice, this list of conditions and the following
 *       disclaimer in the documentation and/or other materials provided
 *       with the distribution.
 *     * Neither the name of the Hiroshima University nor the names of
 *       its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written
 *       permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#ifndef ROCRAND_MTGP32_H_
#define ROCRAND_MTGP32_H_

#include <stdlib.h>

#ifndef FQUALIFIERS
#define FQUALIFIERS __forceinline__ __device__
#endif // FQUALIFIERS_

#include "rocrand/rocrand.h"
#include "rocrand/rocrand_common.h"

#define MTGP_MEXP 11213
#define MTGP_N 351
#define MTGP_FLOOR_2P 256
#define MTGP_CEIL_2P 512
#define MTGP_TN MTGP_FLOOR_2P
#define MTGP_LS (MTGP_TN * 3)
#define MTGP_BN_MAX 512
#define MTGP_TS 16
#define MTGP_STATE 1024
#define MTGP_MASK 1023

// Source: https://github.com/MersenneTwister-Lab/MTGP/blob/master/mtgp32-fast.h
struct mtgp32_params_fast_t {
    int mexp;
    int pos;
    int sh1;
    int sh2;
    uint32_t tbl[16];
    uint32_t tmp_tbl[16];
    uint32_t flt_tmp_tbl[16];
    uint32_t mask;
    unsigned char poly_sha1[21];
};

namespace rocrand_device {

struct mtgp32_params
{
    unsigned int pos_tbl[MTGP_BN_MAX];
    unsigned int param_tbl[MTGP_BN_MAX][MTGP_TS];
    unsigned int temper_tbl[MTGP_BN_MAX][MTGP_TS];
    unsigned int single_temper_tbl[MTGP_BN_MAX][MTGP_TS];
    unsigned int sh1_tbl[MTGP_BN_MAX];
    unsigned int sh2_tbl[MTGP_BN_MAX];
    unsigned int mask[1];
};

typedef mtgp32_params_fast_t mtgp32_fast_params;

struct mtgp32_state
{
    int offset;
    int id;
    unsigned int status[MTGP_STATE];
};

inline
void rocrand_mtgp32_init_state(unsigned int array[],
                               const mtgp32_fast_params *para, unsigned int seed)
{
    int i;
    int size = para->mexp / 32 + 1;
    unsigned int hidden_seed;
    unsigned int tmp;
    hidden_seed = para->tbl[4] ^ (para->tbl[8] << 16);
    tmp = hidden_seed;
    tmp += tmp >> 16;
    tmp += tmp >> 8;
    memset(array, tmp & 0xff, sizeof(unsigned int) * size);
    array[0] = seed;
    array[1] = hidden_seed;
    for (i = 1; i < size; i++)
        array[i] ^= (1812433253) * (array[i - 1] ^ (array[i - 1] >> 30)) + i;
}

class mtgp32_engine
{
public:
    FQUALIFIERS
    mtgp32_engine()
    {

    }

    FQUALIFIERS
    mtgp32_engine(const mtgp32_state m_state,
                  const mtgp32_params * params,
                  int bid)
    {
        this->m_state = m_state;
        pos_tbl = params->pos_tbl[bid];
        sh1_tbl = params->sh1_tbl[bid];
        sh2_tbl = params->sh2_tbl[bid];
        mask = params->mask[0];
        for (int j = 0; j < MTGP_TS; j++) {
            param_tbl[j] = params->param_tbl[bid][j];
            temper_tbl[j] = params->temper_tbl[bid][j];
            single_temper_tbl[j] = params->single_temper_tbl[bid][j];
        }
    }

    FQUALIFIERS
    void copy(const mtgp32_engine * m_engine)
    {
#if defined(__HIP_DEVICE_COMPILE__) || defined(USE_HIP_CPU)
        const unsigned int thread_id = threadIdx.x;
        for(int i = thread_id; i < MTGP_STATE; i += blockDim.x)
            m_state.status[i] = m_engine->m_state.status[i];

        if (thread_id == 0)
        {
            m_state.offset = m_engine->m_state.offset;
            m_state.id = m_engine->m_state.id;
            pos_tbl = m_engine->pos_tbl;
            sh1_tbl = m_engine->sh1_tbl;
            sh2_tbl = m_engine->sh2_tbl;
            mask = m_engine->mask;
        }
        if (thread_id < MTGP_TS)
        {
            param_tbl[thread_id] = m_engine->param_tbl[thread_id];
            temper_tbl[thread_id] = m_engine->temper_tbl[thread_id];
            single_temper_tbl[thread_id] = m_engine->single_temper_tbl[thread_id];
        }
        __syncthreads();
#else
        this->m_state = m_engine->m_state;
        pos_tbl = m_engine->pos_tbl;
        sh1_tbl = m_engine->sh1_tbl;
        sh2_tbl = m_engine->sh2_tbl;
        mask = m_engine->mask;
        for (int j = 0; j < MTGP_TS; j++) {
            param_tbl[j] = m_engine->param_tbl[j];
            temper_tbl[j] = m_engine->temper_tbl[j];
            single_temper_tbl[j] = m_engine->single_temper_tbl[j];
        }
#endif
    }

    FQUALIFIERS
    void set_params(mtgp32_params * params)
    {
        pos_tbl = params->pos_tbl[m_state.id];
        sh1_tbl = params->sh1_tbl[m_state.id];
        sh2_tbl = params->sh2_tbl[m_state.id];
        mask = params->mask[0];
        for (int j = 0; j < MTGP_TS; j++) {
            param_tbl[j] = params->param_tbl[m_state.id][j];
            temper_tbl[j] = params->temper_tbl[m_state.id][j];
            single_temper_tbl[j] = params->single_temper_tbl[m_state.id][j];
        }
    }

    FQUALIFIERS
    unsigned int operator()()
    {
        return this->next();
    }

    FQUALIFIERS
    unsigned int next()
    {
#if defined(__HIP_DEVICE_COMPILE__) || defined(USE_HIP_CPU)
        unsigned int t   = threadIdx.x;
        unsigned int d   = blockDim.x;
        int pos = pos_tbl;
        unsigned int r;
        unsigned int o;

        r = para_rec(m_state.status[(t + m_state.offset) & MTGP_MASK],
                     m_state.status[(t + m_state.offset + 1) & MTGP_MASK],
                     m_state.status[(t + m_state.offset + pos) & MTGP_MASK]);
        m_state.status[(t + m_state.offset + MTGP_N) & MTGP_MASK] = r;

        o = temper(r, m_state.status[(t + m_state.offset + pos - 1) & MTGP_MASK]);
        __syncthreads();
        if (t == 0)
            m_state.offset = (m_state.offset + d) & MTGP_MASK;
        __syncthreads();
        return o;
#else
        return 0;
#endif
    }

    FQUALIFIERS
    unsigned int next_single()
    {
#if defined(__HIP_DEVICE_COMPILE__) || defined(USE_HIP_CPU)
        unsigned int t   = threadIdx.x;
        unsigned int d   = blockDim.x;
        int pos = pos_tbl;
        unsigned int r;
        unsigned int o;

        r = para_rec(m_state.status[(t + m_state.offset) & MTGP_MASK],
                     m_state.status[(t + m_state.offset + 1) & MTGP_MASK],
                     m_state.status[(t + m_state.offset + pos) & MTGP_MASK]);
        m_state.status[(t + m_state.offset + MTGP_N) & MTGP_MASK] = r;

        o = temper_single(r, m_state.status[(t + m_state.offset + pos - 1) & MTGP_MASK]);
        __syncthreads();
        if (t == 0)
            m_state.offset = (m_state.offset + d) & MTGP_MASK;
        __syncthreads();
        return o;
#else
        return 0;
#endif
    }

private:
    FQUALIFIERS
    unsigned int para_rec(unsigned int X1, unsigned int X2, unsigned int Y)
    {
        unsigned int X = (X1 & mask) ^ X2;
        unsigned int MAT;

        X ^= X << sh1_tbl;
        Y = X ^ (Y >> sh2_tbl);
        MAT = param_tbl[Y & 0x0f];
        return Y ^ MAT;
    }

    FQUALIFIERS
    unsigned int temper(unsigned int V, unsigned int T)
    {
        unsigned int MAT;

        T ^= T >> 16;
        T ^= T >> 8;
        MAT = temper_tbl[T & 0x0f];
        return V ^ MAT;
    }

    FQUALIFIERS
    unsigned int temper_single(unsigned int V, unsigned int T)
    {
        unsigned int MAT;
        unsigned int r;

        T ^= T >> 16;
        T ^= T >> 8;
        MAT = single_temper_tbl[T & 0x0f];
        r = (V >> 9) ^ MAT;
        return r;
    }

public:
    // State
    mtgp32_state m_state;
    // Parameters
    unsigned int pos_tbl;
    unsigned int param_tbl[MTGP_TS];
    unsigned int temper_tbl[MTGP_TS];
    unsigned int sh1_tbl;
    unsigned int sh2_tbl;
    unsigned int single_temper_tbl[MTGP_TS];
    unsigned int mask;

}; // mtgp32_engine class

} // end namespace rocrand_device

typedef rocrand_device::mtgp32_engine rocrand_state_mtgp32;
typedef rocrand_device::mtgp32_state mtgp32_state;
typedef rocrand_device::mtgp32_fast_params mtgp32_fast_params;
typedef rocrand_device::mtgp32_params mtgp32_params;

__host__ inline
rocrand_status rocrand_make_state_mtgp32(rocrand_state_mtgp32 * d_state,
                                         mtgp32_fast_params params[],
                                         int n,
                                         unsigned long long seed)
{
    int i;
    rocrand_state_mtgp32 * h_state = (rocrand_state_mtgp32 *) malloc(sizeof(rocrand_state_mtgp32) * n);
    seed = seed ^ (seed >> 32);

    if (h_state == NULL)
        return ROCRAND_STATUS_ALLOCATION_FAILED;

    for (i = 0; i < n; i++) {
        rocrand_device::rocrand_mtgp32_init_state(&(h_state[i].m_state.status[0]), &params[i], (unsigned int)seed + i + 1);
        h_state[i].m_state.offset = 0;
        h_state[i].m_state.id = i;
        h_state[i].pos_tbl = params[i].pos;
        h_state[i].sh1_tbl = params[i].sh1;
        h_state[i].sh2_tbl = params[i].sh2;
        h_state[i].mask = params[0].mask;
        for (int j = 0; j < MTGP_TS; j++) {
            h_state[i].param_tbl[j] = params[i].tbl[j];
            h_state[i].temper_tbl[j] = params[i].tmp_tbl[j];
            h_state[i].single_temper_tbl[j] = params[i].flt_tmp_tbl[j];
        }
    }

    hipMemcpy(d_state, h_state, sizeof(rocrand_state_mtgp32) * n, hipMemcpyHostToDevice);
    free(h_state);

    if (hipGetLastError() != hipSuccess)
        return ROCRAND_STATUS_ALLOCATION_FAILED;

    return ROCRAND_STATUS_SUCCESS;
}

__host__ inline
rocrand_status rocrand_make_constant(const mtgp32_fast_params params[], mtgp32_params * p)
{
    const int block_num = MTGP_BN_MAX;
    const int size1 = sizeof(uint32_t) * block_num;
    const int size2 = sizeof(uint32_t) * block_num * MTGP_TS;
    uint32_t *h_pos_tbl;
    uint32_t *h_sh1_tbl;
    uint32_t *h_sh2_tbl;
    uint32_t *h_param_tbl;
    uint32_t *h_temper_tbl;
    uint32_t *h_single_temper_tbl;
    uint32_t *h_mask;
    h_pos_tbl = (uint32_t *)malloc(size1);
    h_sh1_tbl = (uint32_t *)malloc(size1);
    h_sh2_tbl = (uint32_t *)malloc(size1);
    h_param_tbl = (uint32_t *)malloc(size2);
    h_temper_tbl = (uint32_t *)malloc(size2);
    h_single_temper_tbl = (uint32_t *)malloc(size2);
    h_mask = (uint32_t *)malloc(sizeof(uint32_t));
    rocrand_status status = ROCRAND_STATUS_SUCCESS;

    if (h_pos_tbl == NULL || h_sh1_tbl == NULL || h_sh2_tbl == NULL
        || h_param_tbl == NULL || h_temper_tbl == NULL || h_single_temper_tbl == NULL
        || h_mask == NULL) {
        printf("failure in allocating host memory for constant table.\n");
        status = ROCRAND_STATUS_ALLOCATION_FAILED;
    }
    else {
        h_mask[0] = params[0].mask;
        for (int i = 0; i < block_num; i++) {
            h_pos_tbl[i] = params[i].pos;
            h_sh1_tbl[i] = params[i].sh1;
            h_sh2_tbl[i] = params[i].sh2;
            for (int j = 0; j < MTGP_TS; j++) {
                h_param_tbl[i * MTGP_TS + j] = params[i].tbl[j];
                h_temper_tbl[i * MTGP_TS + j] = params[i].tmp_tbl[j];
                h_single_temper_tbl[i * MTGP_TS + j] = params[i].flt_tmp_tbl[j];
            }
        }

        if (hipMemcpy(p->pos_tbl, h_pos_tbl, size1, hipMemcpyHostToDevice) != hipSuccess)
            status = ROCRAND_STATUS_ALLOCATION_FAILED;
        if (hipMemcpy(p->sh1_tbl, h_sh1_tbl, size1, hipMemcpyHostToDevice) != hipSuccess)
            status = ROCRAND_STATUS_ALLOCATION_FAILED;
        if (hipMemcpy(p->sh2_tbl, h_sh2_tbl, size1, hipMemcpyHostToDevice) != hipSuccess)
            status = ROCRAND_STATUS_ALLOCATION_FAILED;
        if (hipMemcpy(p->param_tbl, h_param_tbl, size2, hipMemcpyHostToDevice) != hipSuccess)
            status = ROCRAND_STATUS_ALLOCATION_FAILED;
        if (hipMemcpy(p->temper_tbl, h_temper_tbl, size2, hipMemcpyHostToDevice) != hipSuccess)
            status = ROCRAND_STATUS_ALLOCATION_FAILED;
        if (hipMemcpy(p->single_temper_tbl, h_single_temper_tbl, size2, hipMemcpyHostToDevice) != hipSuccess)
            status = ROCRAND_STATUS_ALLOCATION_FAILED;
        if (hipMemcpy(p->mask, h_mask, sizeof(unsigned int), hipMemcpyHostToDevice) != hipSuccess)
            status = ROCRAND_STATUS_ALLOCATION_FAILED;
    }

    free(h_pos_tbl);
    free(h_sh1_tbl);
    free(h_sh2_tbl);
    free(h_param_tbl);
    free(h_temper_tbl);
    free(h_single_temper_tbl);
    free(h_mask);

    return status;
}

FQUALIFIERS
unsigned int rocrand(rocrand_state_mtgp32 * state)
{
    return state->next();
}

FQUALIFIERS
void rocrand_mtgp32_block_copy(rocrand_state_mtgp32 * src, rocrand_state_mtgp32 * dest)
{
    dest->copy(src);
}

FQUALIFIERS
void rocrand_mtgp32_set_params(rocrand_state_mtgp32 * state, mtgp32_params * params)
{
    state->set_params(params);
}

 // end of group rocranddevice
#endif // ROCRAND_MTGP32_H_