matrixMul例子改为athread版的各种疑问
-
尝试着把手册里的 matrixMul 这个代码改为 athread 版的时候遇到一些问题。
问题1:PE_MODE 怎么才能用?
slave.c: In function 'func':
slave.c:24: error: 'PE_MODE' undeclared (first use in this function)
slave.c:24: error: (Each undeclared identifier is reported only once
slave.c:24: error: for each function it appears in.)
这个 PE_MODE 找不到吗?我于是把 PE_MODE 直接定义为 0 了。
问题2:__thread_local 怎么使用?
按手册里写了这样一句:__thread_local volatile unsigned long get_reply, put_reply;
但是编译不通过:
slave.c:10: error: expected '=', ',', ';', 'asm' or '__attribute__' before 'volatile'
slave.c: In function 'func':
slave.c:23: error: 'get_reply' undeclared (first use in this function)
slave.c:23: error: (Each undeclared identifier is reported only once
slave.c:23: error: for each function it appears in.)
slave.c:39: error: 'put_reply' undeclared (first use in this function)
于是我改成局部变量了,请问这个 __thread_local 怎么正确使用呢?
更新
我已经知道了,slave.h 不是自己的头文件,是这个:/usr/sw-mpp/swcc/sw5gcc-binary/include/slave.h。
附1:编译
$ sw5cc -host -O3 -c matrixMul.c
$ sw5cc -slave -O3 -c slave.c
$ sw5cc -hybrid matrixMul.o slave.o -o test
附2:提交
$ bsub -I -b -q q_sw_share -n 1 -cgsp 64 ./test
附3:源文件
/// \file matrixMul.c #include <athread.h> #include <stdio.h> #include <sys/time.h> #include "slave.h" #define M 1024 #define N 1024 #define K 1024 double A[M][N]; double B[N][K]; double C[M][K]; extern SLAVE_FUN(func)(); // Timer static inline unsigned long rpcc() { unsigned long time; asm("rtc %0": "=r" (time) : ); return time; } int main() { int i, j, k; unsigned long count; //-------------------------------------------------------------------------- // Matrix multiplication on host //-------------------------------------------------------------------------- // init A, B for (i = 0; i < M; i++) for (j = 0; j < N; j++) A[i][j] = i; for (i = 0; i < N; i++) for (j = 0; j < K; j++) B[i][j] = j; count = -rpcc(); // on host: multiplication C = A*B for (i = 0; i < M; i++) for (k = 0; k < K; k++) for (j = 0; j < N; j++) C[i][k] += A[i][j] * B[j][k]; count += rpcc(); printf("Host: Matrix multiplication A[%d][%d] * B[%d][%d], counter: %ld\n", M, N, N, K, count); //-------------------------------------------------------------------------- // Matrix multiplication on device //-------------------------------------------------------------------------- // init A, B for (i = 0; i < M; i++) for (j = 0; j < N; j++) A[i][j] = i; for (i = 0; i < N; i++) for (j = 0; j < K; j++) B[i][j] = j; athread_init(); //athread_set_num_threads(64); count = -rpcc(); // on device: multiplication C = A*B athread_spawn(func, 0); athread_join(); count += rpcc(); printf("Device: Matrix multiplication A[%d][%d] * B[%d][%d], counter: %ld\n", M, N, N, K, count); athread_halt(); return 0; }
/// \file slave.h void func();
/// \file slave.c #include "slave.h" #define M 1024 #define N 1024 #define K 1024 #define PE_MODE 0 // error: PE_MODE undeclared extern double A[M][N], B[N][K], C[M][K]; //__thread_local volatile unsigned long get_reply, put_reply; // error: expected '=', ',', ';', 'asm' or '__attribute__' before 'volatile' void func() { volatile unsigned long get_reply, put_reply; double A_dev[N], B_dev[4][K], C_dev[K]; int tid, tsize, round, roundsize; int i, j, k; tid = athread_get_id(-1); //tsize = athread_get_max_threads(); // error: slave_athread_get_max_threads undeclared tsize = 64; roundsize = N / 4; while (tid < M) { // Fetch a single row of A get_reply = 0; athread_get(PE_MODE, &A[tid][0], &A_dev[0], N*8, &get_reply, 0, 0, 0); while (get_reply != 2); // Matrix-vector multiplication for (round = 0; round < roundsize; round++) { // Fetch B get_reply = 0; athread_get(PE_MODE, &B[4 * round][0], &B_dev[0][0], 4*K*8, &get_reply, 0, 0, 0); while (get_reply != 2); // Partial multiplication for (k = 0; k < K; k++) for (j = 0; j < 4; j++) C_dev[k] += A_dev[j] * B_dev[j][k]; } // Send the single row of C back to host put_reply = 0; athread_put(PE_MODE, &C_dev[0], &C[tid][0], K*8, &put_reply, 0, 0, 0); while (put_reply != 1) ; tid += tsize; } }