diff --git a/common_param.h b/common_param.h index 9e5edbb816..3245bae6ef 100644 --- a/common_param.h +++ b/common_param.h @@ -47,6 +47,9 @@ typedef struct { int dtb_entries; int switch_ratio; + int divide_rate; + int divide_limit; + int preferred_size; int offsetA, offsetB, align; #if BUILD_HFLOAT16 == 1 int shgemm_p, shgemm_q, shgemm_r; diff --git a/driver/level3/gemm.c b/driver/level3/gemm.c index e37d86c28d..41e7d43e70 100644 --- a/driver/level3/gemm.c +++ b/driver/level3/gemm.c @@ -59,13 +59,21 @@ #define GEMM_Q 128 #endif -#ifdef GEMM_DIVIDE_RATE +#ifdef DYNAMIC_ARCH +#define DIVIDE_LIMIT gotoblas->divide_limit +#define DIVIDE_RATE gotoblas->divide_rate +#else +#define DIVIDE_LIMIT GEMM_DIVIDE_LIMIT #define DIVIDE_RATE GEMM_DIVIDE_RATE #endif -#ifdef GEMM_DIVIDE_LIMIT -#define DIVIDE_LIMIT GEMM_DIVIDE_LIMIT -#endif +//#ifdef GEMM_DIVIDE_RATE +//#define DIVIDE_RATE GEMM_DIVIDE_RATE +//#endif + +//#ifdef GEMM_DIVIDE_LIMIT +//#define DIVIDE_LIMIT GEMM_DIVIDE_LIMIT +//#endif #ifdef THREADED_LEVEL3 #include "level3_thread.c" diff --git a/driver/level3/level3_gemm3m_thread.c b/driver/level3/level3_gemm3m_thread.c index 26d07fa944..318d7d553e 100644 --- a/driver/level3/level3_gemm3m_thread.c +++ b/driver/level3/level3_gemm3m_thread.c @@ -41,6 +41,7 @@ #define CACHE_LINE_SIZE 8 #endif +#define DIVIDE_RATE_MAX 2 #ifndef DIVIDE_RATE #define DIVIDE_RATE 2 #endif @@ -93,7 +94,7 @@ typedef struct { #else volatile #endif - BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE]; + BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE_MAX]; } job_t; @@ -294,7 +295,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *a, *b, *c; job_t *job = (job_t *)args -> common; BLASLONG xxx, bufferside; - FLOAT *buffer[DIVIDE_RATE]; + FLOAT *buffer[DIVIDE_RATE_MAX]; BLASLONG ls, min_l, jjs, min_jj; BLASLONG is, min_i, div_n; diff --git a/driver/level3/level3_syrk_threaded.c b/driver/level3/level3_syrk_threaded.c index 1b656f902d..47f303c1d2 100644 --- a/driver/level3/level3_syrk_threaded.c +++ b/driver/level3/level3_syrk_threaded.c @@ -41,6 +41,8 @@ #define CACHE_LINE_SIZE 8 #endif +#define DIVIDE_RATE_MAX 2 + #ifndef DIVIDE_RATE #define DIVIDE_RATE 2 #endif @@ -69,7 +71,7 @@ _Atomic #else volatile #endif - BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE]; + BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE_MAX]; } job_t; @@ -133,7 +135,7 @@ _Atomic static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos){ - FLOAT *buffer[DIVIDE_RATE]; + FLOAT *buffer[DIVIDE_RATE_MAX]; BLASLONG k, lda, ldc; BLASLONG m_from, m_to, n_from, n_to; diff --git a/driver/level3/level3_thread.c b/driver/level3/level3_thread.c index 327dc2d01d..83403aef70 100644 --- a/driver/level3/level3_thread.c +++ b/driver/level3/level3_thread.c @@ -41,12 +41,17 @@ #define CACHE_LINE_SIZE 8 #endif +#define DIVIDE_RATE_MAX 2 + #ifndef DIVIDE_RATE #define DIVIDE_RATE 2 #endif -#ifndef GEMM_PREFERED_SIZE -#define GEMM_PREFERED_SIZE 1 +#ifdef DYNAMIC_ARCH +#define GEMM_PREFERRED_SIZE gotoblas->preferred_size +#endif +#ifndef GEMM_PREFERRED_SIZE +#define GEMM_PREFERRED_SIZE 1 #endif //The array of job_t may overflow the stack. @@ -93,7 +98,7 @@ typedef struct { volatile - BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE]; + BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE_MAX]; } job_t; @@ -234,7 +239,7 @@ typedef struct { static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, IFLOAT *sa, IFLOAT *sb, BLASLONG mypos){ - IFLOAT *buffer[DIVIDE_RATE]; + IFLOAT *buffer[DIVIDE_RATE_MAX]; BLASLONG k, lda, ldb, ldc; BLASLONG m_from, m_to, n_from, n_to; @@ -707,7 +712,7 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG while (m > 0){ width = blas_quickdivide(m + nthreads_m - num_parts - 1, nthreads_m - num_parts); - width = round_up(m, width, GEMM_PREFERED_SIZE); + width = round_up(m, width, GEMM_PREFERRED_SIZE); m -= width; @@ -758,7 +763,7 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG if (width < switch_ratio) { width = switch_ratio; } - width = round_up(width_n, width, GEMM_PREFERED_SIZE); + width = round_up(width_n, width, GEMM_PREFERRED_SIZE); width_n -= width; if (width_n < 0) { diff --git a/kernel/setparam-ref.c b/kernel/setparam-ref.c index 51981c6253..044ececd18 100644 --- a/kernel/setparam-ref.c +++ b/kernel/setparam-ref.c @@ -54,6 +54,12 @@ gotoblas_t TABLE_NAME = { SWITCH_RATIO, + GEMM_DIVIDE_RATE, + + GEMM_DIVIDE_LIMIT, + + GEMM_PREFERRED_SIZE, + GEMM_DEFAULT_OFFSET_A, GEMM_DEFAULT_OFFSET_B, GEMM_DEFAULT_ALIGN, #ifdef BUILD_HFLOAT16 diff --git a/param.h b/param.h index 7e4a04501b..4faaebff7c 100644 --- a/param.h +++ b/param.h @@ -630,10 +630,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(XDOUBLE) || defined(DOUBLE) #define SWITCH_RATIO 4 -#define GEMM_PREFERED_SIZE 4 +#define GEMM_PREFERRED_SIZE 4 #else #define SWITCH_RATIO 8 -#define GEMM_PREFERED_SIZE 8 +#define GEMM_PREFERRED_SIZE 8 #endif #ifdef ARCH_X86 @@ -1539,10 +1539,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(XDOUBLE) || defined(DOUBLE) #define SWITCH_RATIO 4 -#define GEMM_PREFERED_SIZE 4 +#define GEMM_PREFERRED_SIZE 4 #else #define SWITCH_RATIO 8 -#define GEMM_PREFERED_SIZE 8 +#define GEMM_PREFERRED_SIZE 8 #endif #ifdef ARCH_X86 @@ -1665,10 +1665,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(XDOUBLE) || defined(DOUBLE) #define SWITCH_RATIO 8 -#define GEMM_PREFERED_SIZE 8 +#define GEMM_PREFERRED_SIZE 8 #else #define SWITCH_RATIO 16 -#define GEMM_PREFERED_SIZE 16 +#define GEMM_PREFERRED_SIZE 16 #endif #define USE_SGEMM_KERNEL_DIRECT 1 @@ -1786,10 +1786,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(XDOUBLE) || defined(DOUBLE) #define SWITCH_RATIO 8 -#define GEMM_PREFERED_SIZE 8 +#define GEMM_PREFERRED_SIZE 8 #else #define SWITCH_RATIO 16 -#define GEMM_PREFERED_SIZE 16 +#define GEMM_PREFERRED_SIZE 16 #endif #define USE_SGEMM_KERNEL_DIRECT 1 @@ -1919,10 +1919,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(XDOUBLE) || defined(DOUBLE) #define SWITCH_RATIO 8 -#define GEMM_PREFERED_SIZE 8 +#define GEMM_PREFERRED_SIZE 8 #else #define SWITCH_RATIO 16 -#define GEMM_PREFERED_SIZE 16 +#define GEMM_PREFERRED_SIZE 16 #endif #define USE_SGEMM_KERNEL_DIRECT 1 @@ -2577,7 +2577,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_ALIGN 0x0ffffUL #define SWITCH_RATIO 16 -#define GEMM_PREFERED_SIZE 16 +#define GEMM_PREFERRED_SIZE 16 #define SGEMM_DEFAULT_UNROLL_M 16 #define SGEMM_DEFAULT_UNROLL_N 8 @@ -2616,7 +2616,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_ALIGN 0x0ffffUL #define SWITCH_RATIO 16 -#define GEMM_PREFERED_SIZE 16 +#define GEMM_PREFERRED_SIZE 16 #define SGEMM_DEFAULT_UNROLL_M 16 #define SGEMM_DEFAULT_UNROLL_N 8 @@ -3611,10 +3611,10 @@ is a big desktop or server with abundant cache rather than a phone or embedded d #if defined(XDOUBLE) || defined(DOUBLE) #define SWITCH_RATIO 8 -#define GEMM_PREFERED_SIZE 4 +#define GEMM_PREFERRED_SIZE 4 #else #define SWITCH_RATIO 16 -#define GEMM_PREFERED_SIZE 8 +#define GEMM_PREFERRED_SIZE 8 #endif #undef BGEMM_ALIGN_K @@ -3662,8 +3662,6 @@ is a big desktop or server with abundant cache rather than a phone or embedded d #elif defined(NEOVERSEN2) || defined(NEOVERSEV2) -#define GEMM_DIVIDE_LIMIT 3 - #if defined(XDOUBLE) || defined(DOUBLE) #define SWITCH_RATIO 8 #else @@ -3751,9 +3749,9 @@ is a big desktop or server with abundant cache rather than a phone or embedded d #define GEMM_DIVIDE_RATE 1 #if defined(XDOUBLE) || defined(DOUBLE) -#define GEMM_PREFERED_SIZE 8 +#define GEMM_PREFERRED_SIZE 8 #else -#define GEMM_PREFERED_SIZE 16 +#define GEMM_PREFERRED_SIZE 16 #endif /* When all BLAS3 routines are implemeted with SVE, SGEMM_DEFAULT_UNROLL_M should be "sve_vl". @@ -4260,6 +4258,18 @@ Until then, just keep it different than DGEMM_DEFAULT_UNROLL_N to keep copy rout #define SWITCH_RATIO 2 #endif +#ifndef GEMM_DIVIDE_RATE +#define GEMM_DIVIDE_RATE 2 +#endif + +#ifndef GEMM_DIVIDE_LIMIT +#define GEMM_DIVIDE_LIMIT 0 +#endif + +#ifndef GEMM_PREFERRED_SIZE +#define GEMM_PREFERRED_SIZE 1 +#endif + #ifndef QGEMM_DEFAULT_UNROLL_M #define QGEMM_DEFAULT_UNROLL_M 2 #endif