    /*        Fast GEMM routine for Alpha 21164/21264      */
    /*         on  Linux, Digital UNIX                     */
    /*        by Kazushige Goto <goto@statabo.rim.or.jp>   */

#include <ctype.h>
#include <stdio.h>
#include "common.h"

static void zgemm_nn_(int m, int n, int k, FLOAT *alpha, FLOAT *a, int lda,
	     FLOAT *b, int ldb, FLOAT *c, int ldc){

  int i, j, l;

  FLOAT alpha_r, alpha_i;
  FLOAT temp_r, temp_i;

  alpha_r = *(alpha + 0);
  alpha_i = *(alpha + 1);

  /* Form  C := alpha*A*B + beta*C */
  for (j = 0; j < n; j++) {
    for (i = 0; i < m; i++) {
      temp_r = 0.;
      temp_i = 0.;
      for (l = 0; l < k; l++) {
	temp_r += b[l*2 + 0 +j*ldb]*a[i*2 + 0 +l*lda]
	        - b[l*2 + 1 +j*ldb]*a[i*2 + 1 +l*lda];
	temp_i += b[l*2 + 0 +j*ldb]*a[i*2 + 1 +l*lda]
	        + b[l*2 + 1 +j*ldb]*a[i*2 + 0 +l*lda];
      }
      c[i*2 + 0 + j*ldc] += alpha_r * temp_r - alpha_i * temp_i;
      c[i*2 + 1 + j*ldc] += alpha_r * temp_i + alpha_i * temp_r;
    }
  }
  
}

static void zgemm_nt_(int m, int n, int k, FLOAT *alpha, FLOAT *a, int lda,
	     FLOAT *b, int ldb, FLOAT *c, int ldc){

  int i, j, l;

  FLOAT alpha_r, alpha_i;
  FLOAT temp_r, temp_i;

  alpha_r = *(alpha + 0);
  alpha_i = *(alpha + 1);

  /* Form  C := alpha*A*B + beta*C */
  for (j = 0; j < n; j++) {
    for (i = 0; i < m; i++) {
      temp_r = 0.;
      temp_i = 0.;
      for (l = 0; l < k; l++) {
	temp_r += b[j*2 + 0 +l*ldb]*a[i*2 + 0 +l*lda]
	        - b[j*2 + 1 +l*ldb]*a[i*2 + 1 +l*lda];
	temp_i += b[j*2 + 0 +l*ldb]*a[i*2 + 1 +l*lda]
	        + b[j*2 + 1 +l*ldb]*a[i*2 + 0 +l*lda];
      }
      c[i*2 + 0 + j*ldc] += alpha_r * temp_r - alpha_i * temp_i;
      c[i*2 + 1 + j*ldc] += alpha_r * temp_i + alpha_i * temp_r;
    }
  }
  
}

static void zgemm_nr_(int m, int n, int k, FLOAT *alpha, FLOAT *a, int lda,
	     FLOAT *b, int ldb, FLOAT *c, int ldc){

  int i, j, l;

  FLOAT alpha_r, alpha_i;
  FLOAT temp_r, temp_i;

  alpha_r = *(alpha + 0);
  alpha_i = *(alpha + 1);

  /* Form  C := alpha*A*B + beta*C */
  for (j = 0; j < n; j++) {
    for (i = 0; i < m; i++) {
      temp_r = 0.;
      temp_i = 0.;
      for (l = 0; l < k; l++) {
	temp_r += b[l*2 + 0 +j*ldb]*a[i*2 + 0 +l*lda]
	        + b[l*2 + 1 +j*ldb]*a[i*2 + 1 +l*lda];
	temp_i += b[l*2 + 0 +j*ldb]*a[i*2 + 1 +l*lda]
	        - b[l*2 + 1 +j*ldb]*a[i*2 + 0 +l*lda];
      }
      c[i*2 + 0 + j*ldc] += alpha_r * temp_r - alpha_i * temp_i;
      c[i*2 + 1 + j*ldc] += alpha_r * temp_i + alpha_i * temp_r;
    }
  }
  
}

static void zgemm_nc_(int m, int n, int k, FLOAT *alpha, FLOAT *a, int lda,
	     FLOAT *b, int ldb, FLOAT *c, int ldc){

  int i, j, l;

  FLOAT alpha_r, alpha_i;
  FLOAT temp_r, temp_i;

  alpha_r = *(alpha + 0);
  alpha_i = *(alpha + 1);

  /* Form  C := alpha*A*B + beta*C */
  for (j = 0; j < n; j++) {
    for (i = 0; i < m; i++) {
      temp_r = 0.;
      temp_i = 0.;
      for (l = 0; l < k; l++) {
	temp_r += b[j*2 + 0 +l*ldb]*a[i*2 + 0 +l*lda]
	        + b[j*2 + 1 +l*ldb]*a[i*2 + 1 +l*lda];
	temp_i += b[j*2 + 0 +l*ldb]*a[i*2 + 1 +l*lda]
	        - b[j*2 + 1 +l*ldb]*a[i*2 + 0 +l*lda];
      }
      c[i*2 + 0 + j*ldc] += alpha_r * temp_r - alpha_i * temp_i;
      c[i*2 + 1 + j*ldc] += alpha_r * temp_i + alpha_i * temp_r;
    }
  }
  
}

static void zgemm_tn_(int m, int n, int k, FLOAT *alpha, FLOAT *a, int lda,
	     FLOAT *b, int ldb, FLOAT *c, int ldc){

  int i, j, l;

  FLOAT alpha_r, alpha_i;
  FLOAT temp_r, temp_i;

  alpha_r = *(alpha + 0);
  alpha_i = *(alpha + 1);

  /* Form  C := alpha*A*B + beta*C */
  for (j = 0; j < n; j++) {
    for (i = 0; i < m; i++) {
      temp_r = 0.;
      temp_i = 0.;
      for (l = 0; l < k; l++) {
	temp_r += b[l*2 + 0 +j*ldb]*a[l*2 + 0 +i*lda]
	        - b[l*2 + 1 +j*ldb]*a[l*2 + 1 +i*lda];
	temp_i += b[l*2 + 0 +j*ldb]*a[l*2 + 1 +i*lda]
	        + b[l*2 + 1 +j*ldb]*a[l*2 + 0 +i*lda];
      }
      c[i*2 + 0 + j*ldc] += alpha_r * temp_r - alpha_i * temp_i;
      c[i*2 + 1 + j*ldc] += alpha_r * temp_i + alpha_i * temp_r;
    }
  }
  
}

static void zgemm_tt_(int m, int n, int k, FLOAT *alpha, FLOAT *a, int lda,
	     FLOAT *b, int ldb, FLOAT *c, int ldc){

  int i, j, l;

  FLOAT alpha_r, alpha_i;
  FLOAT temp_r, temp_i;

  alpha_r = *(alpha + 0);
  alpha_i = *(alpha + 1);

  /* Form  C := alpha*A*B + beta*C */
  for (j = 0; j < n; j++) {
    for (i = 0; i < m; i++) {
      temp_r = 0.;
      temp_i = 0.;
      for (l = 0; l < k; l++) {
	temp_r += b[j*2 + 0 +l*ldb]*a[l*2 + 0 +i*lda]
	        - b[j*2 + 1 +l*ldb]*a[l*2 + 1 +i*lda];
	temp_i += b[j*2 + 0 +l*ldb]*a[l*2 + 1 +i*lda]
	        + b[j*2 + 1 +l*ldb]*a[l*2 + 0 +i*lda];
      }
      c[i*2 + 0 + j*ldc] += alpha_r * temp_r - alpha_i * temp_i;
      c[i*2 + 1 + j*ldc] += alpha_r * temp_i + alpha_i * temp_r;
    }
  }
  
}

static void zgemm_tr_(int m, int n, int k, FLOAT *alpha, FLOAT *a, int lda,
	     FLOAT *b, int ldb, FLOAT *c, int ldc){

  int i, j, l;

  FLOAT alpha_r, alpha_i;
  FLOAT temp_r, temp_i;

  alpha_r = *(alpha + 0);
  alpha_i = *(alpha + 1);

  /* Form  C := alpha*A*B + beta*C */
  for (j = 0; j < n; j++) {
    for (i = 0; i < m; i++) {
      temp_r = 0.;
      temp_i = 0.;
      for (l = 0; l < k; l++) {
	temp_r += b[l*2 + 0 +j*ldb]*a[l*2 + 0 +i*lda]
	        + b[l*2 + 1 +j*ldb]*a[l*2 + 1 +i*lda];
	temp_i += b[l*2 + 0 +j*ldb]*a[l*2 + 1 +i*lda]
	        - b[l*2 + 1 +j*ldb]*a[l*2 + 0 +i*lda];
      }
      c[i*2 + 0 + j*ldc] += alpha_r * temp_r - alpha_i * temp_i;
      c[i*2 + 1 + j*ldc] += alpha_r * temp_i + alpha_i * temp_r;
    }
  }
  
}

static void zgemm_tc_(int m, int n, int k, FLOAT *alpha, FLOAT *a, int lda,
	     FLOAT *b, int ldb, FLOAT *c, int ldc){

  int i, j, l;

  FLOAT alpha_r, alpha_i;
  FLOAT temp_r, temp_i;

  alpha_r = *(alpha + 0);
  alpha_i = *(alpha + 1);

  /* Form  C := alpha*A*B + beta*C */
  for (j = 0; j < n; j++) {
    for (i = 0; i < m; i++) {
      temp_r = 0.;
      temp_i = 0.;
      for (l = 0; l < k; l++) {
	temp_r += b[j*2 + 0 +l*ldb]*a[l*2 + 0 +i*lda]
	        + b[j*2 + 1 +l*ldb]*a[l*2 + 1 +i*lda];
	temp_i += b[j*2 + 0 +l*ldb]*a[l*2 + 1 +i*lda]
	        - b[j*2 + 1 +l*ldb]*a[l*2 + 0 +i*lda];
      }
      c[i*2 + 0 + j*ldc] += alpha_r * temp_r - alpha_i * temp_i;
      c[i*2 + 1 + j*ldc] += alpha_r * temp_i + alpha_i * temp_r;
    }
  }
  
}

static void zgemm_rn_(int m, int n, int k, FLOAT *alpha, FLOAT *a, int lda,
	     FLOAT *b, int ldb, FLOAT *c, int ldc){

  int i, j, l;

  FLOAT alpha_r, alpha_i;
  FLOAT temp_r, temp_i;

  alpha_r = *(alpha + 0);
  alpha_i = *(alpha + 1);

  /* Form  C := alpha*A*B + beta*C */
  for (j = 0; j < n; j++) {
    for (i = 0; i < m; i++) {
      temp_r = 0.;
      temp_i = 0.;
      for (l = 0; l < k; l++) {
	temp_r += b[l*2 + 0 +j*ldb]*a[i*2 + 0 +l*lda]
	        + b[l*2 + 1 +j*ldb]*a[i*2 + 1 +l*lda];
	temp_i +=-b[l*2 + 0 +j*ldb]*a[i*2 + 1 +l*lda]
	        + b[l*2 + 1 +j*ldb]*a[i*2 + 0 +l*lda];
      }
      c[i*2 + 0 + j*ldc] += alpha_r * temp_r - alpha_i * temp_i;
      c[i*2 + 1 + j*ldc] += alpha_r * temp_i + alpha_i * temp_r;
    }
  }
  
}

static void zgemm_rt_(int m, int n, int k, FLOAT *alpha, FLOAT *a, int lda,
	     FLOAT *b, int ldb, FLOAT *c, int ldc){

  int i, j, l;

  FLOAT alpha_r, alpha_i;
  FLOAT temp_r, temp_i;

  alpha_r = *(alpha + 0);
  alpha_i = *(alpha + 1);

  /* Form  C := alpha*A*B + beta*C */
  for (j = 0; j < n; j++) {
    for (i = 0; i < m; i++) {
      temp_r = 0.;
      temp_i = 0.;
      for (l = 0; l < k; l++) {
	temp_r += b[j*2 + 0 +l*ldb]*a[i*2 + 0 +l*lda]
	        + b[j*2 + 1 +l*ldb]*a[i*2 + 1 +l*lda];
	temp_i +=-b[j*2 + 0 +l*ldb]*a[i*2 + 1 +l*lda]
	        + b[j*2 + 1 +l*ldb]*a[i*2 + 0 +l*lda];
      }
      c[i*2 + 0 + j*ldc] += alpha_r * temp_r - alpha_i * temp_i;
      c[i*2 + 1 + j*ldc] += alpha_r * temp_i + alpha_i * temp_r;
    }
  }
  
}

static void zgemm_rr_(int m, int n, int k, FLOAT *alpha, FLOAT *a, int lda,
	     FLOAT *b, int ldb, FLOAT *c, int ldc){

  int i, j, l;

  FLOAT alpha_r, alpha_i;
  FLOAT temp_r, temp_i;

  alpha_r = *(alpha + 0);
  alpha_i = *(alpha + 1);

  /* Form  C := alpha*A*B + beta*C */
  for (j = 0; j < n; j++) {
    for (i = 0; i < m; i++) {
      temp_r = 0.;
      temp_i = 0.;
      for (l = 0; l < k; l++) {
	temp_r += b[l*2 + 0 +j*ldb]*a[i*2 + 0 +l*lda]
	        - b[l*2 + 1 +j*ldb]*a[i*2 + 1 +l*lda];
	temp_i +=-b[l*2 + 0 +j*ldb]*a[i*2 + 1 +l*lda]
	        - b[l*2 + 1 +j*ldb]*a[i*2 + 0 +l*lda];
      }
      c[i*2 + 0 + j*ldc] += alpha_r * temp_r - alpha_i * temp_i;
      c[i*2 + 1 + j*ldc] += alpha_r * temp_i + alpha_i * temp_r;
    }
  }
  
}

static void zgemm_rc_(int m, int n, int k, FLOAT *alpha, FLOAT *a, int lda,
	     FLOAT *b, int ldb, FLOAT *c, int ldc){

  int i, j, l;

  FLOAT alpha_r, alpha_i;
  FLOAT temp_r, temp_i;

  alpha_r = *(alpha + 0);
  alpha_i = *(alpha + 1);

  /* Form  C := alpha*A*B + beta*C */
  for (j = 0; j < n; j++) {
    for (i = 0; i < m; i++) {
      temp_r = 0.;
      temp_i = 0.;
      for (l = 0; l < k; l++) {
	temp_r += b[j*2 + 0 +l*ldb]*a[i*2 + 0 +l*lda]
	        - b[j*2 + 1 +l*ldb]*a[i*2 + 1 +l*lda];
	temp_i +=-b[j*2 + 0 +l*ldb]*a[i*2 + 1 +l*lda]
	        - b[j*2 + 1 +l*ldb]*a[i*2 + 0 +l*lda];
      }
      c[i*2 + 0 + j*ldc] += alpha_r * temp_r - alpha_i * temp_i;
      c[i*2 + 1 + j*ldc] += alpha_r * temp_i + alpha_i * temp_r;
    }
  }
  
}

static void zgemm_cn_(int m, int n, int k, FLOAT *alpha, FLOAT *a, int lda,
	     FLOAT *b, int ldb, FLOAT *c, int ldc){

  int i, j, l;

  FLOAT alpha_r, alpha_i;
  FLOAT temp_r, temp_i;

  alpha_r = *(alpha + 0);
  alpha_i = *(alpha + 1);

  /* Form  C := alpha*A*B + beta*C */
  for (j = 0; j < n; j++) {
    for (i = 0; i < m; i++) {
      temp_r = 0.;
      temp_i = 0.;
      for (l = 0; l < k; l++) {
	temp_r += b[l*2 + 0 +j*ldb]*a[l*2 + 0 +i*lda]
	        + b[l*2 + 1 +j*ldb]*a[l*2 + 1 +i*lda];
	temp_i +=-b[l*2 + 0 +j*ldb]*a[l*2 + 1 +i*lda]
	        + b[l*2 + 1 +j*ldb]*a[l*2 + 0 +i*lda];
      }
      c[i*2 + 0 + j*ldc] += alpha_r * temp_r - alpha_i * temp_i;
      c[i*2 + 1 + j*ldc] += alpha_r * temp_i + alpha_i * temp_r;
    }
  }
  
}

static void zgemm_ct_(int m, int n, int k, FLOAT *alpha, FLOAT *a, int lda,
	     FLOAT *b, int ldb, FLOAT *c, int ldc){

  int i, j, l;

  FLOAT alpha_r, alpha_i;
  FLOAT temp_r, temp_i;

  alpha_r = *(alpha + 0);
  alpha_i = *(alpha + 1);

  /* Form  C := alpha*A*B + beta*C */
  for (j = 0; j < n; j++) {
    for (i = 0; i < m; i++) {
      temp_r = 0.;
      temp_i = 0.;
      for (l = 0; l < k; l++) {
	temp_r += b[j*2 + 0 +l*ldb]*a[l*2 + 0 +i*lda]
	        + b[j*2 + 1 +l*ldb]*a[l*2 + 1 +i*lda];
	temp_i +=-b[j*2 + 0 +l*ldb]*a[l*2 + 1 +i*lda]
	        + b[j*2 + 1 +l*ldb]*a[l*2 + 0 +i*lda];
      }
      c[i*2 + 0 + j*ldc] += alpha_r * temp_r - alpha_i * temp_i;
      c[i*2 + 1 + j*ldc] += alpha_r * temp_i + alpha_i * temp_r;
    }
  }
  
}

static void zgemm_cr_(int m, int n, int k, FLOAT *alpha, FLOAT *a, int lda,
	     FLOAT *b, int ldb, FLOAT *c, int ldc){

  int i, j, l;

  FLOAT alpha_r, alpha_i;
  FLOAT temp_r, temp_i;

  alpha_r = *(alpha + 0);
  alpha_i = *(alpha + 1);

  /* Form  C := alpha*A*B + beta*C */
  for (j = 0; j < n; j++) {
    for (i = 0; i < m; i++) {
      temp_r = 0.;
      temp_i = 0.;
      for (l = 0; l < k; l++) {
	temp_r += b[l*2 + 0 +j*ldb]*a[l*2 + 0 +i*lda]
	        - b[l*2 + 1 +j*ldb]*a[l*2 + 1 +i*lda];
	temp_i +=-b[l*2 + 0 +j*ldb]*a[l*2 + 1 +i*lda]
	        - b[l*2 + 1 +j*ldb]*a[l*2 + 0 +i*lda];
      }
      c[i*2 + 0 + j*ldc] += alpha_r * temp_r - alpha_i * temp_i;
      c[i*2 + 1 + j*ldc] += alpha_r * temp_i + alpha_i * temp_r;
    }
  }
  
}

static void zgemm_cc_(int m, int n, int k, FLOAT *alpha, FLOAT *a, int lda,
	     FLOAT *b, int ldb, FLOAT *c, int ldc){

  int i, j, l;

  FLOAT alpha_r, alpha_i;
  FLOAT temp_r, temp_i;

  alpha_r = *(alpha + 0);
  alpha_i = *(alpha + 1);

  /* Form  C := alpha*A*B + beta*C */
  for (j = 0; j < n; j++) {
    for (i = 0; i < m; i++) {
      temp_r = 0.;
      temp_i = 0.;
      for (l = 0; l < k; l++) {
	temp_r += b[j*2 + 0 +l*ldb]*a[l*2 + 0 +i*lda]
	        - b[j*2 + 1 +l*ldb]*a[l*2 + 1 +i*lda];
	temp_i +=-b[j*2 + 0 +l*ldb]*a[l*2 + 1 +i*lda]
	        - b[j*2 + 1 +l*ldb]*a[l*2 + 0 +i*lda];
      }
      c[i*2 + 0 + j*ldc] += alpha_r * temp_r - alpha_i * temp_i;
      c[i*2 + 1 + j*ldc] += alpha_r * temp_i + alpha_i * temp_r;
    }
  }
  
}

int ZGEMMC_(char *TRANSA, char *TRANSB, int *m, int *n, int *k,
	    FLOAT *alpha, FLOAT *a, int *LDA, FLOAT *b, int *LDB, 
	    FLOAT *beta, FLOAT *c, int *LDC){

  FLOAT temp_r, temp_i;
  
  /* Local variables */
  int i, j;
  int nota, notb;
  int conja, conjb;
  int transa, transb;
  int refa, refb;
  int lda = *LDA * 2;
  int ldb = *LDB * 2;
  int ldc = *LDC * 2;
  FLOAT alpha_r = *(alpha + 0);
  FLOAT alpha_i = *(alpha + 1);
  FLOAT beta_r  = *(beta  + 0);
  FLOAT beta_i  = *(beta  + 1);

  /* Function Body */
  nota   = (toupper(*TRANSA) == 'N');
  notb   = (toupper(*TRANSB) == 'N');
  transa = (toupper(*TRANSA) == 'T');
  transb = (toupper(*TRANSB) == 'T');
  refa   = (toupper(*TRANSA) == 'R');
  refb   = (toupper(*TRANSB) == 'R');
  conja  = (toupper(*TRANSA) == 'C');
  conjb  = (toupper(*TRANSB) == 'C');
  
  if ((*m == 0) || (*n == 0)) return 0;

  if ((beta_r != 1.0) || (beta_i != 0.)){
    if (beta_r == 0. && beta_i == 0.) {
      for (j = 0; j < *n; j++) {
	for (i = 0; i < (*m)*2; i+=2) {
	  c[i + j * ldc + 0] = 0.;
	  c[i + j * ldc + 1] = 0.;
	}
      }
    } else {
      for (j = 0; j < *n; j++) {
	for (i = 0; i < (*m) *2; i+= 2) {
	  temp_r = beta_r * c[i + j * ldc + 0] - beta_i * c[i + j * ldc + 1]; 
	  temp_i = beta_r * c[i + j * ldc + 1] + beta_i * c[i + j * ldc + 0];
	  c[i + j * ldc + 0] = temp_r;
	  c[i + j * ldc + 1] = temp_i;
	}
      }
    }
  }

  if ((alpha_r == 0. && alpha_i == 0.) || *k==0) return 0;

  if (nota) {
    if (notb)   zgemm_nn_(*m, *n, *k, alpha, a, lda, b, ldb, c, ldc);
    if (transb) zgemm_nt_(*m, *n, *k, alpha, a, lda, b, ldb, c, ldc);
    if (refb)   zgemm_nr_(*m, *n, *k, alpha, a, lda, b, ldb, c, ldc);
    if (conjb)  zgemm_nc_(*m, *n, *k, alpha, a, lda, b, ldb, c, ldc);
  }

  if (transa) {
    if (notb)   zgemm_tn_(*m, *n, *k, alpha, a, lda, b, ldb, c, ldc);
    if (transb) zgemm_tt_(*m, *n, *k, alpha, a, lda, b, ldb, c, ldc);
    if (refb)   zgemm_tr_(*m, *n, *k, alpha, a, lda, b, ldb, c, ldc);
    if (conjb)  zgemm_tc_(*m, *n, *k, alpha, a, lda, b, ldb, c, ldc);
  }

  if (refa) {
    if (notb)   zgemm_rn_(*m, *n, *k, alpha, a, lda, b, ldb, c, ldc);
    if (transb) zgemm_rt_(*m, *n, *k, alpha, a, lda, b, ldb, c, ldc);
    if (refb)   zgemm_rr_(*m, *n, *k, alpha, a, lda, b, ldb, c, ldc);
    if (conjb)  zgemm_rc_(*m, *n, *k, alpha, a, lda, b, ldb, c, ldc);
  }

  if (conja) {
    if (notb)   zgemm_cn_(*m, *n, *k, alpha, a, lda, b, ldb, c, ldc);
    if (transb) zgemm_ct_(*m, *n, *k, alpha, a, lda, b, ldb, c, ldc);
    if (refb)   zgemm_cr_(*m, *n, *k, alpha, a, lda, b, ldb, c, ldc);
    if (conjb)  zgemm_cc_(*m, *n, *k, alpha, a, lda, b, ldb, c, ldc);
  }

  return 0;

}
