matrixMul例子改为athread版的各种疑问



  • 尝试着把手册里的 matrixMul 这个代码改为 athread 版的时候遇到一些问题。

    问题1:PE_MODE 怎么才能用?

    slave.c: In function 'func':
    slave.c:24: error: 'PE_MODE' undeclared (first use in this function)
    slave.c:24: error: (Each undeclared identifier is reported only once
    slave.c:24: error: for each function it appears in.)
    

    这个 PE_MODE 找不到吗?我于是把 PE_MODE 直接定义为 0 了。

    问题2:__thread_local 怎么使用?
    按手册里写了这样一句:

    __thread_local volatile unsigned long get_reply, put_reply;
    

    但是编译不通过:

    slave.c:10: error: expected '=', ',', ';', 'asm' or '__attribute__' before 'volatile'
    slave.c: In function 'func':
    slave.c:23: error: 'get_reply' undeclared (first use in this function)
    slave.c:23: error: (Each undeclared identifier is reported only once
    slave.c:23: error: for each function it appears in.)
    slave.c:39: error: 'put_reply' undeclared (first use in this function)
    

    于是我改成局部变量了,请问这个 __thread_local 怎么正确使用呢?

    更新
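    我已经知道了:手册里的 slave.h 并不是自己写的头文件,而是系统自带的头文件 /usr/sw-mpp/swcc/sw5gcc-binary/include/slave.h。上面两个问题(PE_MODE 未声明、__thread_local 无法识别)应该都是因为我自己写的同名 slave.h 把它遮住了;把自己的头文件改名,并在 slave.c 中改用 #include <slave.h> 即可。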

    附1:编译

    $ sw5cc -host -O3 -c matrixMul.c
    $ sw5cc -slave -O3 -c slave.c
    $ sw5cc -hybrid matrixMul.o slave.o -o test
    

    附2:提交

    $ bsub -I -b -q q_sw_share -n 1 -cgsp 64 ./test
    

    附3:源文件

    /// \file matrixMul.c
    #include <athread.h>
    #include <stdio.h>
    #include <sys/time.h>
    #include "slave.h"
    
    // Matrix dimensions: A is M x N, B is N x K, C is M x K.
    // slave.c repeats these definitions; the two copies must be kept in sync.
    #define M 1024
    #define N 1024
    #define K 1024
    
    // File-scope matrices with external linkage; slave.c refers to them via
    // `extern`. As objects with static storage duration they start out
    // zero-filled, which the host-side `C[i][k] += ...` accumulation relies on.
    double A[M][N];
    double B[N][K];
    double C[M][K];
    
    // Declaration of the slave-side entry point that main() passes to
    // athread_spawn(). NOTE(review): SLAVE_FUN is presumably a macro supplied by
    // <athread.h> that maps `func` to its slave-side symbol -- confirm.
    extern SLAVE_FUN(func)();
    
    // Timer
    static inline unsigned long rpcc() {
        unsigned long time;
        asm("rtc %0": "=r" (time) : );
        return time;
    }
    
    
    // Benchmark driver: multiplies C = A * B once on the host and once on the
    // slave cores through athread, printing the rpcc() counter delta for each run.
    // Always returns 0.
    int main() {
        int i, j, k;
        unsigned long count;

        //--------------------------------------------------------------------------
        // Matrix multiplication on host
        //--------------------------------------------------------------------------
        // init A, B: every element of row i of A is i, every element of column j
        // of B is j.
        for (i = 0; i < M; i++)
            for (j = 0; j < N; j++)
                A[i][j] = i;

        for (i = 0; i < N; i++)
            for (j = 0; j < K; j++)
                B[i][j] = j;

        // Store the negated start value; adding the end value below leaves the
        // elapsed count (unsigned arithmetic wraps, so the difference is exact).
        count = -rpcc();

        // on host:  multiplication C = A*B
        // C is a file-scope array and therefore starts at zero.
        for (i = 0; i < M; i++)
            for (k = 0; k < K; k++)
                for (j = 0; j < N; j++)
                    C[i][k] += A[i][j] * B[j][k];

        count += rpcc();
        // count is unsigned long, so it is printed with %lu.
        printf("Host: Matrix multiplication A[%d][%d] * B[%d][%d], counter: %lu\n", M, N, N, K, count);


        //--------------------------------------------------------------------------
        // Matrix multiplication on device
        //--------------------------------------------------------------------------
        // init A, B (same values as for the host run)
        for (i = 0; i < M; i++)
            for (j = 0; j < N; j++)
                A[i][j] = i;

        for (i = 0; i < N; i++)
            for (j = 0; j < K; j++)
                B[i][j] = j;

        athread_init();
        //athread_set_num_threads(64);
        count = -rpcc();

        // on device:  multiplication C = A*B
        // The timed region covers spawning the slave function and waiting for it
        // to finish.
        athread_spawn(func, 0);
        athread_join();

        count += rpcc();
        printf("Device: Matrix multiplication A[%d][%d] * B[%d][%d], counter: %lu\n", M, N, N, K, count);
        athread_halt();

        return 0;
    }
    
    /// \file slave.h
    void func();
    
    /// \file slave.c
    #include "slave.h"
    
    // Matrix dimensions; must match the definitions in matrixMul.c.
    #define M 1024
    #define N 1024
    #define K 1024
    #define PE_MODE 0   // workaround: PE_MODE was reported as undeclared, so it is defined as 0 here
    
    // Matrices defined in matrixMul.c.
    extern double A[M][N], B[N][K], C[M][K];
    
    // The manual's file-scope declaration
    //   __thread_local volatile unsigned long get_reply, put_reply;
    // was rejected by the compiler (expected '=', ',', ';', 'asm' or
    // '__attribute__' before 'volatile'), so func() declares both as locals.
    
    // Slave-side kernel for C = A * B.
    //
    // Each thread handles rows tid, tid + tsize, tid + 2*tsize, ... of C. For one
    // row it fetches the matching row of A, streams B through a 4-row buffer,
    // accumulates the row of C locally and finally writes that row back.
    //
    // Fixes relative to the previous version:
    //   - C_dev is cleared before each row; it used to be accumulated into
    //     without ever being initialised.
    //   - the A element paired with buffered row j of B is A_dev[4 * round + j];
    //     indexing A_dev[j] only ever used the first four elements of the row.
    //   - each wait follows exactly one athread_get, so it waits for the reply
    //     counter to reach 1, the same way the athread_put wait already does.
    //     NOTE(review): assumes every completed transfer bumps the counter by
    //     one -- confirm against the athread manual.
    //
    // Assumes N is a multiple of 4 (roundsize = N / 4 drops any remainder).
    void func() {
        volatile unsigned long get_reply, put_reply;
        double A_dev[N], B_dev[4][K], C_dev[K];
        int tid, tsize, round, roundsize;
        int j, k;

        tid = athread_get_id(-1);
        // athread_get_max_threads() did not compile in the slave build
        // (slave_athread_get_max_threads undeclared), so the thread count is
        // hard-coded.
        tsize = 64;
        roundsize = N / 4;

        while (tid < M) {
            // Fetch a single row of A
            get_reply = 0;
            athread_get(PE_MODE, &A[tid][0], &A_dev[0], N*8, &get_reply, 0, 0, 0);
            while (get_reply != 1);

            // Start this row's accumulation from zero
            for (k = 0; k < K; k++)
                C_dev[k] = 0.0;

            // Matrix-vector multiplication
            for (round = 0; round < roundsize; round++) {
                // Fetch rows 4*round .. 4*round+3 of B
                get_reply = 0;
                athread_get(PE_MODE, &B[4 * round][0], &B_dev[0][0], 4*K*8, &get_reply, 0, 0, 0);
                while (get_reply != 1);

                // Partial multiplication: buffered row j of B is global row
                // 4*round + j, so it pairs with that element of the A row.
                for (k = 0; k < K; k++)
                    for (j = 0; j < 4; j++)
                        C_dev[k] += A_dev[4 * round + j] * B_dev[j][k];
            }

            // Send the single row of C back to host
            put_reply = 0;
            athread_put(PE_MODE, &C_dev[0], &C[tid][0], K*8, &put_reply, 0, 0, 0);
            while (put_reply != 1) ;

            tid += tsize;
        }
    }
    

    matrixMul.zip


登录后回复