/* ========================== C MeatAxe =============================
   window.c -  Matrix window operations and Strassen-Winograd multiplication

   (C) Copyright 2015 Simon King, Institut fuer Mathematik,
   FSU Jena, Germany  <simon.king@uni-jena.de>
   This program is free software; see the file COPYING for details.
   ================================================================== */

#include <string.h>
#include <stdlib.h>
#include <inttypes.h>
#include "meataxe.h"

/**
 ** @defgroup matwin Strassen-Winograd multiplication
 ** @{
 ** @details
 ** A matrix window is a rectangular part of a matrix. As an implementation
 ** detail, each row of the window must be formed by longs.
 ** Hence, not all upper left corners are allowed and not all row sizes
 ** are possible.
 **
 ** Matrix windows are used in the asymptotically fast Strassen-Winograd
 ** multiplication algorithm, which is a divide-and-conquer algorithm.
 ** In order to avoid the allocation of additional matrices (thus, in order
 ** to be efficient), intermediate results are stored in the same matrix
 ** that will eventually contain the final result of the multiplication.
 **
 ** We use the memory efficient schedule of Douglas-Heroux-Slishman-Smith
 ** described in @ref BDPZ09 "[BDPZ09]".
 **
 **/


/* --------------------------------------------------------------------------
   Local data
   -------------------------------------------------------------------------- */

/* NOTE(review): macro defined outside this file (presumably in meataxe.h);
   it presumably provides the file information used by the MTX_ERROR*
   macros below -- confirm against the header. */
MTX_DEFINE_FILE_INFO

/* Byte type used in this file for raw, byte-wise access to packed rows. */
typedef unsigned char BYTE;

/** Matrix Window.
The MatrixWindow_t structure represents a rectangular part of a matrix.
A window does not own a copy of the data: it only records the ambient matrix,
a pointer to the first byte of the window inside that matrix, and the
window's extent (rows, and longs per row).
***/
typedef struct
{
  int Nor;                      /**< number of rows of the window */
  size_t RowSize;               /**< size of window rows (number of longs in a contiguous memory chunk) */
  Matrix_t *Matrix;             /**< ambient matrix containing the window */
  PTR ULCorner;                 /**< Pointer to the upper left window corner */
}
    MatrixWindow_t;

/* Current cutoff (in longs) below which StrassenStep() stops recursing. */
size_t cutoff = sizeof(long)/2;

/**
 ** Set the cutoff for Winograd-Strassen multiplication.
 ** The divide-and-conquer step is only applied to matrices that have
 ** at least "cutoff*MPB*sizeof(long)" rows and whose rows consist of
 ** at least "cutoff" longs, so that the "critical matrices" are
 ** roughly square.
 **
 ** Passing zero for @a size restores the default cutoff "sizeof(long)/2".
 ** @param size New cutoff, or 0 for the default.
 **/

void StrassenSetCutoff(size_t size)
{
    cutoff = (size != 0) ? size : sizeof(long)/2;
}

/* ------------------------------------------------------------------

   Allocation and deallocation of a matrix window

   ------------------------------------------------------------------ */


/**
 ** Allocation and null initialisation of a matrix window.
 **
 ** Note that the rowsize is given in long, not in byte. The reason is
 ** functions such as FfAddRowPartial() or FfAddMapRowWindow() are internally
 ** operating on longs. By consequence, in the Strassen-Winograd
 ** multiplication algorithm, we have to divide our matrix rows
 ** into longs, not into bytes.
 ** @param fl Field size.
 ** @param nor Number of rows.
 ** @param rowsize Rows are formed by @a rowsize longs in memory.
 ** @return Matrix window, with pointer in the upper left corner, initialised to zero.
 **/

MatrixWindow_t *WindowAlloc(int fl, int nor, size_t rowsize)
{
    MatrixWindow_t *out;
    out = ALLOC(MatrixWindow_t);
    if (out == NULL)
    {
        MTX_ERROR1("%E",MTX_ERR_NOMEM);
        return NULL;
    }
    if (FfSetField(fl))
    {
        free(out);
        return NULL;
    }
    out->Matrix = MatAlloc(fl, nor, rowsize*sizeof(long)*MPB);
    if (out->Matrix == NULL)
    {
        free(out);
        MTX_ERROR1("%E",MTX_ERR_NOMEM);
        return NULL;
    }
    out->ULCorner = out->Matrix->Data;
    out->Nor = nor;
    out->RowSize = rowsize;
    return out;
}

/**
 ** Release the memory used by this window.
 ** Both the ambient matrix (if any) and the window structure itself are freed.
 ** A NULL argument is accepted and ignored, mirroring free(NULL).
 ** @attention Only to be used if the surrounding matrix can be destroyed! Otherwise, just do free(@a m)!
 ** @param m Matrix window to be freed (may be NULL).
 **/
void WindowFree(MatrixWindow_t *m)
{
    if (m == NULL)
    {
        return;
    }
    if (m->Matrix != NULL)
    {
        MatFree(m->Matrix);
    }
    free(m);
}

/* ------------------------------------------------------------------
 * Auxiliary / Debugging
 ----------------------------------------------------------------- */
/*
void WindowShow(MatrixWindow_t *A)
{
long i,j;
PTR p = A->ULCorner;
FfSetNoc(A->Matrix->Noc);
for (i=A->Nor; i>0; i--, FfStepPtr(&p))
  {
  for (j=0; j< (A->RowSize)*sizeof(long); j++)
    printf("%3.3d ", (unsigned char)p[j]);
  printf("\n");
  }
}
*/

/**
 ** Fill the window with zeroes while leaving the rest of the ambient matrix alone.
 ** Each window row is cleared separately; stepping from row to row uses the
 ** row size of the ambient matrix, which is selected via FfSetNoc().
 ** @param A The matrix window to be zeroed out.
 **/

void WindowClear(MatrixWindow_t *A)
{
    const size_t bytes_per_row = A->RowSize*sizeof(long);
    PTR row = A->ULCorner;
    long remaining;

    FfSetNoc(A->Matrix->Noc);
    remaining = A->Nor;
    while (remaining > 0)
    {
        memset(row, FF_ZERO, bytes_per_row);
        FfStepPtr(&row);
        --remaining;
    }
}

/**
 ** Multiply a vector by a matrix window.
 ** This function multiplies the vector @a row from the right by the matrix window
 ** pointed at by @a matrix and @em adds the result into @a result. The number of columns
 ** in both @a matrix and @a result is determined by @a rowsize.
 ** @attention @a result and @a row must not overlap. Otherwise the result is undefined!
 ** @attention Stepping from one window row to the next uses LPR resp. FfCurrentRowSize,
 ** so the caller must have selected the row size of the ambient matrix of @a matrix
 ** (see the FfSetNoc() call in WindowAddMul()) before calling this function.
 ** @param row The source vector, formed by @a nor columns.
 ** @param matrix Pointer to the upper left corner of a window of @a nor rows by
 ** (@a rowsize * sizeof(long) * MPB) columns inside an ambient matrix whose
 ** row size is FfCurrentRowSize.
 ** @param nor Number of rows of the matrix window.
 ** @param[out] result The vector which @a row * @a matrix is added to. It has @a rowsize * sizeof(long) * MPB columns.
 ** @param rowsize Number of longs forming a row of @a matrix.
 **/

void FfAddMapRowWindow(PTR row, PTR matrix, int nor, PTR result, size_t rowsize)

{
    register int i;
    register FEL f;
    BYTE *m = (BYTE *) matrix;

#ifdef DEBUG
    /* Debug builds only: report an overlap of the source vector and the result. */
    if (result >= row && result < row + FfRowSize(nor))
    MTX_ERROR("row and result overlap: undefined result!");
    if (row >= result && row < result + (rowsize*sizeof(long)))
    MTX_ERROR("row and result overlap: undefined result!");
#endif

    if (FfOrder == 2)       /* GF(2) is a special case */
    {
        /* x1 walks through the window in units of long; r walks through the
           source vector byte by byte. The mask loop below consumes up to
           eight vector entries per byte, starting with the highest bit. */
        register long *x1 = (long *) matrix;
        register BYTE *r = (BYTE *) row;

        for (i = nor; i > 0; ++r)
        {
            register BYTE mask;
            if (*r == 0)
            {
                /* Eight zero entries at once: nothing to add for these rows.
                   i may become negative here, which ends the outer loop. */
                i -= 8;
                x1 += 8 * LPR;  /* Skip 8 rows of the matrix window in the ambient matrix*/
                continue;
            }
            for (mask = 0x80; mask != 0 && i > 0; mask >>= 1, --i)
            {
                if ((mask & *r) == 0)
                {
                    x1 += LPR;  /* Skip a single row */
                    continue;
                }

                /* The current vector entry is 1: XOR the window row into result. */
#ifdef ASM_MMX
                /* NOTE(review): 32-bit x86 inline assembly that reads its operands
                   at fixed offsets from %ebp; it presumably relies on a specific
                   stack frame layout -- confirm before enabling ASM_MMX. */
__asm__("    pushl %ebx\n");
__asm__("    movl %0,%%ebx" : : "g" (x1) );
__asm__("    pushl %ecx\n"
    "    pushl %edx\n"
    "    movl 20(%ebp),%ecx\n"  /* result */
    );
__asm__ (
        "    movl 24(%ebp),%edx\n"   /* this time, it is rowsize, not LPR */
        "    sarl $1,%edx\n"
        "    je .FASTXOR_1\n"
        "    .align 16\n"
    ".FASTXOR_2:\n"
        "    movq (%ebx),%mm0\n"
        "    addl $8,%ebx\n"
        "    pxor (%ecx),%mm0\n"
        "    movq %mm0,(%ecx)\n"
        "    addl $8,%ecx\n"
        "    decl %edx\n"
        "    jne .FASTXOR_2\n"
    ".FASTXOR_1:\n"
    "    popl %edx\n"
    "    popl %ecx\n");
__asm__("    movl %%ebx,%0" : : "g" (x1) );
__asm__("    popl %ebx\n"
    );
#else
                {
                    register long *x2 = (long *)result;
                    register int k;
                    /* XOR rowsize longs of the window row into the result. */
                    for (k = rowsize; k; --k)
                        *x2++ ^= *x1++;
                    /* Now, x1 points to the first item
                     * after the current line of the window.
                     * We need to move it to the first position
                     * of the next line of the window.
                     */
                    x1 += (LPR-rowsize);
                }
#endif
            }
        }
    }
    else                /* Any other field */
    {
        /* brow/pos address the current entry of the source vector:
           MPB entries are read from each byte via mtx_textract. */
        register BYTE *brow = (BYTE *) row;
        register int pos = 0;
        size_t l_rowsize = rowsize*sizeof(long);   /* window row length in bytes */
        for (i = nor; i > 0; --i)
        {
            f = mtx_textract[pos][*brow];
            if (++pos == (int) MPB)
            {
                pos = 0;
                ++brow;
            }
            if (f != FF_ZERO)
            {
                register BYTE *v = m;
                register BYTE *r = result;
                register BYTE x;
                if (f == FF_ONE)
                {
                    /* Coefficient one: plain byte-wise addition of the window row. */
                    register size_t k = l_rowsize;
                    for (; k != 0; --k)
                    {
                        x=*v++;
                        /* Zero bytes are skipped (assumes a zero byte holds only zero entries). */
                        if (x) *r = mtx_tadd[*r][x];
                        ++r;
                    }
                }
                else
                {
                    /* General coefficient: scale each byte with the multiplication
                       table row for f, then add it to the result. */
                    register BYTE *multab = mtx_tmult[f];
                    register size_t k = l_rowsize;
                    for (; k != 0; --k)
                    {
                        x=*v++;
                        if (x) *r = mtx_tadd[multab[x]][*r];
                        ++r;
                    }
                }
            }
            m += FfCurrentRowSize;  /* next row of window in the ambient matrix */
        }
    }
}

/**
 ** Compute @a left + @a right and store the sum in @a dest.
 ** @a left and @a right must be distinct, but one of them may coincide with @a dest.
 ** Whether a summand coincides with @a dest is decided by comparing the
 ** upper left corner pointers of the windows.
 ** @param[out] dest Matrix window which the result is to be assigned to.
 ** @param left Matrix window.
 ** @param right Matrix window.
 ** @return Either @a dest or the NULL pointer on error (the only error may occur in a compatibility check).
 **/
MatrixWindow_t *WindowSum(MatrixWindow_t *dest, MatrixWindow_t *left, MatrixWindow_t *right)
{
  FfSetField(left->Matrix->Field);
  if (left->Matrix->Field != right->Matrix->Field
      || left->Nor != right->Nor
      || left->RowSize != right->RowSize)
    {
      MTX_ERROR1("Windows cannot be added: %E", MTX_ERR_INCOMPAT);
      return NULL;
    }

  const size_t nlongs = left->RowSize;
  const int left_noc = left->Matrix->Noc;
  const int right_noc = right->Matrix->Noc;
  const int dest_noc = dest->Matrix->Noc;
  PTR out = dest->ULCorner;
  int rows_todo = left->Nor;

  /* Does dest share its storage with one of the summands? */
  const int into_left = (left->ULCorner == dest->ULCorner);
  const int into_right = !into_left && (right->ULCorner == dest->ULCorner);

  if (into_left || into_right)
    {
      /* In-place case: add the other summand onto dest, row by row. */
      PTR src = into_left ? right->ULCorner : left->ULCorner;
      const int src_noc = into_left ? right_noc : left_noc;

      for (; rows_todo != 0; --rows_todo)
        {
          FfAddRowPartial(out, src, 0, nlongs);
          FfSetNoc(dest_noc);
          FfStepPtr(&out);
          FfSetNoc(src_noc);
          FfStepPtr(&src);
        }
    }
  else
    {
      /* dest is separate: copy the row of left first, then add the row of right. */
      PTR from_left = left->ULCorner;
      PTR from_right = right->ULCorner;
      const size_t row_bytes = nlongs * sizeof(long);

      for (; rows_todo != 0; --rows_todo)
        {
          memcpy(out, from_left, row_bytes);
          FfSetNoc(left_noc);
          FfStepPtr(&from_left);
          FfAddRowPartial(out, from_right, 0, nlongs);
          FfSetNoc(dest_noc);
          FfStepPtr(&out);
          FfSetNoc(right_noc);
          FfStepPtr(&from_right);
        }
    }
  return dest;
}

/**
 ** Compute @a left - @a right and store the difference in @a dest.
 ** @a left and @a right must be distinct, but one of them may coincide with @a dest.
 ** Whether an operand coincides with @a dest is decided by comparing the
 ** upper left corner pointers of the windows.
 ** @param[out] dest Matrix window which the result is to be assigned to.
 ** @param left Matrix window (minuend).
 ** @param right Matrix window (subtrahend).
 ** @return Either @a dest or the NULL pointer on error (the only error may occur in a compatibility check).
 **/
MatrixWindow_t *WindowDif(MatrixWindow_t *dest, MatrixWindow_t *left, MatrixWindow_t *right)
{
  FfSetField(left->Matrix->Field);
  if (left->Matrix->Field != right->Matrix->Field
      || left->Nor != right->Nor
      || left->RowSize != right->RowSize)
    {
      MTX_ERROR1("Windows cannot be subtracted: %E", MTX_ERR_INCOMPAT);
      return NULL;
    }

  const size_t nlongs = left->RowSize;
  const int left_noc = left->Matrix->Noc;
  const int right_noc = right->Matrix->Noc;
  const int dest_noc = dest->Matrix->Noc;
  PTR out = dest->ULCorner;
  int rows_todo = left->Nor;

  if (left->ULCorner == dest->ULCorner)
    {
      /* dest already holds left: subtract right in place. */
      PTR subtrahend = right->ULCorner;
      while (rows_todo != 0)
        {
          FfSubRowPartial(out, subtrahend, 0, nlongs);
          FfSetNoc(dest_noc);
          FfStepPtr(&out);
          FfSetNoc(right_noc);
          FfStepPtr(&subtrahend);
          --rows_todo;
        }
    }
  else if (right->ULCorner == dest->ULCorner)
    {
      /* dest already holds right: use the reversed subtraction with left. */
      PTR minuend = left->ULCorner;
      while (rows_todo != 0)
        {
          FfSubRowPartialReverse(out, minuend, 0, nlongs);
          FfSetNoc(dest_noc);
          FfStepPtr(&out);
          FfSetNoc(left_noc);
          FfStepPtr(&minuend);
          --rows_todo;
        }
    }
  else
    {
      /* dest is separate: copy the row of left first, then subtract the row of right. */
      PTR minuend = left->ULCorner;
      PTR subtrahend = right->ULCorner;
      const size_t row_bytes = nlongs * sizeof(long);
      while (rows_todo != 0)
        {
          memcpy(out, minuend, row_bytes);
          FfSetNoc(left_noc);
          FfStepPtr(&minuend);
          FfSubRowPartial(out, subtrahend, 0, nlongs);
          FfSetNoc(dest_noc);
          FfStepPtr(&out);
          FfSetNoc(right_noc);
          FfStepPtr(&subtrahend);
          --rows_todo;
        }
    }
  return dest;
}

/**
 ** Add the product @a left * @a right to @a dest (school book multiplication).
 **
 ** @a dest->Matrix is assumed to be allocated over the correct field and
 ** with suitable dimensions, so that the result can be written into it.
 ** The memory covered by @a dest @em must be disjoint from the memory
 ** covered by @a left and @a right. Compatibility of dimensions is not tested.
 **
 ** One row of @a left is processed per row of @a dest; the number of rows
 ** handled is @a dest->Nor.
 ** @param[out] dest Matrix window which the product of @a left and @a right is added to.
 ** @param left Matrix window.
 ** @param right Matrix window.
 ** @return @a dest, there is no error return value.
 **/
MatrixWindow_t *WindowAddMul(MatrixWindow_t *dest, MatrixWindow_t *left, MatrixWindow_t *right)
{
    FfSetField(left->Matrix->Field);

    PTR left_row = left->ULCorner;
    PTR right_corner = right->ULCorner;
    PTR dest_row = dest->ULCorner;
    long rows_todo = dest->Nor;

    while (rows_todo != 0)
    {
        /* FfAddMapRowWindow() steps through the right factor using the
           row size of its ambient matrix, so select that one first. */
        FfSetNoc(right->Matrix->Noc);
        FfAddMapRowWindow(left_row, right_corner, right->Nor, dest_row, right->RowSize);

        /* Advance to the next row of the left factor ... */
        FfSetNoc(left->Matrix->Noc);
        FfStepPtr(&left_row);

        /* ... and to the next row of the result. */
        FfSetNoc(dest->Matrix->Noc);
        FfStepPtr(&dest_row);

        --rows_todo;
    }
    return dest;
}

/** Create a window of a matrix.
 ** Only the fields of @a out are filled in; no memory is allocated and no
 ** matrix data is copied. Presumably @a M will be freed separately, hence a
 ** window set up by this function must be released with free() (if it was
 ** heap-allocated at all), not with WindowFree().
 **
 ** The function is declared <tt>static inline</tt>: with a plain C99
 ** <tt>inline</tt> no external definition would be emitted, so any call the
 ** compiler decides not to inline would fail to link.
 ** @param out An allocated matrix window whose data fields will be filled with new data.
 ** @param M matrix.
 ** @param nor Number of rows of the window.
 ** @param rowsize Rowsize of the window, which must correspond to a block of longs in memory.
 ** @param p Pointer to the upper left corner of the window
 **/
static inline void MatrixToWindow (MatrixWindow_t *out, const Matrix_t *M, long nor, long rowsize, PTR p)
{
  /* The window structure stores a non-const pointer to its ambient matrix;
     dropping the qualifier is made explicit here instead of relying on an
     implicit (and diagnosed) conversion. */
  out->Matrix = (Matrix_t *) M;
  out->Nor = (int) nor;
  out->RowSize = (size_t) rowsize;
  out->ULCorner = p;
}

/**
 ** Multiply matrix windows.
 ** This function multiplies @a A_win from the right by @a B_win and writes
 ** the result into @a dest_win.
 ** @attention @a dest must be initialised to zero!
 **
 ** The matrix windows must be compatible for multiplication, i.e. they must be over
 ** the same field, and the number of columns of @a A_win must be equal to the
 ** number of rows of @a B_win.
 **
 ** Moreover, it is assumed that @a dest_win is allocated in the right dimensions.
 ** Since parts of @a dest_win are used to store temporary results, it is essential
 ** that @a dest_win initially is zero.
 **
 ** If the halved windows fall below the cutoff (see StrassenSetCutoff()), the
 ** product is computed by WindowAddMul(). Otherwise one Strassen-Winograd step
 ** is performed with two scratch windows, recursing for the seven
 ** sub-products; the scratch windows are released on every return path,
 ** including failure of a recursive call.
 ** @param[out] dest_win Result.
 ** @param A_win Left factor.
 ** @param B_win Right factor
 ** @return The function returns 0 on success and a nonzero value on error.
 **/

int StrassenStep(MatrixWindow_t *dest_win, MatrixWindow_t *A_win, MatrixWindow_t *B_win)
{
  FfSetField(A_win->Matrix->Field);
  int MPL = MPB*sizeof(long);   /* matrix entries stored in one long */
  int full_nrow_cutoff = cutoff*MPL;

  /* Determine the size of submatrices in divide-and-conquer.
   *
   * Note that the rowsize is given in the unit "long".
   * Generally we have trailing padding empty bytes. We have to cut
   * so that two full blocks fit into the non-padded area. This is what we do:
   * - We halve the number of rows of A (rounded down).
   * - We halve the rowsize of B (rounded down), since padding doesn't matter here.
   * - We determine how many FULL longs fit into a *row* (of A) of B->Nor items.
   *   Half of it (rounded down) gives the rowsize of A's submatrices.
   * - From that rowsize, we obtain the corresponding number of rows of
   *   B's submatrices.
   */
  int A_sub_nrows = A_win->Nor/2;
  size_t B_sub_rowsize = B_win->RowSize/2;
  size_t A_sub_rowsize = (B_win->Nor/MPL)/2;
  int B_sub_nrows = A_sub_rowsize*MPL;

  /* If the submatrices were too small, we use school book multiplication */
  if ((A_sub_nrows < full_nrow_cutoff) ||
      (B_sub_nrows < full_nrow_cutoff) ||
      (A_sub_rowsize < cutoff) ||
      (B_sub_rowsize < cutoff))
    {
      /* The ambient matrix of dest_win is supposed to be empty. Thus, we add rather than overwrite */
      WindowAddMul(dest_win, A_win, B_win);
      return 0;
    }

  size_t B_sub_rowsize2 = B_sub_rowsize + B_sub_rowsize;
  size_t A_sub_rowsize2 = A_sub_rowsize + A_sub_rowsize;
  size_t B_sub_rowsize2b = B_sub_rowsize2*sizeof(long); /* size in byte */
  size_t A_sub_rowsize2b = A_sub_rowsize2*sizeof(long);
  int B_sub_nrows2 = B_sub_nrows + B_sub_nrows;
  int A_sub_nrows2 = A_sub_nrows + A_sub_nrows;

  Matrix_t *A = A_win->Matrix;
  Matrix_t *B = B_win->Matrix;
  Matrix_t *dest = dest_win->Matrix;

  /* Because of rounding, there are stripes on the right
   * and the lower boundary that are not part of the
   * clean divide-and-conquer algorithm.
   */
  int A_nrows_rem = A_win->Nor - A_sub_nrows2;
  size_t A_rowsize_rem = A_win->RowSize - A_sub_rowsize2;

  int B_nrows_rem = B_win->Nor - B_sub_nrows2;
  size_t B_rowsize_rem = B_win->RowSize - B_sub_rowsize2;

  /* ----------------------------------------------------
   * Allocate temporary space.
   * We use a schedule introduced by Douglas-Heroux-Slishman-Smith
   * (see also Boyer-Pernet-Zhou, "Memory efficient scheduling of
   * Strassen-Winograd's matrix multiplication algorithm", Table 1).
   * X must be able to hold both an A-sized and a dest-sized quarter,
   * hence the larger of the two rowsizes.
     ---------------------------------------------------- */

  MatrixWindow_t *X, *Y;
  X = WindowAlloc(A->Field, A_sub_nrows,
                  (A_sub_rowsize > B_sub_rowsize) ? A_sub_rowsize : B_sub_rowsize);
  if (X == NULL)
    {
      MTX_ERROR1("Error allocating a temporary window: %E",MTX_ERR_NOMEM);
      return 1;
    }
  Y = WindowAlloc(A->Field, B_sub_nrows, B_sub_rowsize);
  if (Y == NULL)
    {
      WindowFree(X);
      MTX_ERROR1("Error allocating a temporary window: %E",MTX_ERR_NOMEM);
      return 1;
    }

  /* Define the sub-windows of A and B. FfGetPtr() and FfCurrentRowSize
     refer to the ambient matrix selected by the preceding FfSetNoc(). */
  MatrixWindow_t A00[1], A01[1], A10[1], A11[1], B00[1], B01[1], B10[1], B11[1];
  MatrixWindow_t A_last_col[1], A_last_row[1];
  MatrixWindow_t B_last_col[1], B_last_row[1], B_bulk[1];
  MatrixWindow_t dest_last_col[1], dest_last_row[1], dest_bulk[1];

  FfSetNoc(A->Noc);
  MatrixToWindow(A00, A, A_sub_nrows, A_sub_rowsize, A_win->ULCorner);
  MatrixToWindow(A01, A, A_sub_nrows, A_sub_rowsize, (PTR)((char*)(A_win->ULCorner)+A_sub_rowsize*sizeof(long)));
  MatrixToWindow(A10, A, A_sub_nrows, A_sub_rowsize, FfGetPtr(A_win->ULCorner, A_sub_nrows));
  MatrixToWindow(A11, A, A_sub_nrows, A_sub_rowsize,
          (PTR)((char*)(A_win->ULCorner)+(A_sub_nrows*FfCurrentRowSize+A_sub_rowsize*sizeof(long))));

  FfSetNoc(B->Noc);
  MatrixToWindow(B00, B, B_sub_nrows, B_sub_rowsize, B_win->ULCorner);
  MatrixToWindow(B01, B, B_sub_nrows, B_sub_rowsize, (PTR)((char*)(B_win->ULCorner)+B_sub_rowsize*sizeof(long)));
  MatrixToWindow(B10, B, B_sub_nrows, B_sub_rowsize, FfGetPtr(B_win->ULCorner, B_sub_nrows));
  MatrixToWindow(B11, B, B_sub_nrows, B_sub_rowsize,
          (PTR)((char*)(B_win->ULCorner)+(B_sub_nrows*FfCurrentRowSize+B_sub_rowsize*sizeof(long))));

  FfSetNoc(dest->Noc);  /* since we may multiply into X, the size is not necessarily the same as for B. */
  PTR dest00 = dest_win->ULCorner;
  PTR dest01 = (PTR)((char*)(dest_win->ULCorner)+B_sub_rowsize*sizeof(long));
  PTR dest10 = FfGetPtr(dest_win->ULCorner,A_sub_nrows);
  PTR dest11 = (PTR)((char*)(dest_win->ULCorner)+(A_sub_nrows*FfCurrentRowSize)+B_sub_rowsize*sizeof(long));

  /* Windows holding temporary results. Every intermediate quantity of the
     schedule lives in one of these seven places, so each place is described
     once and reused; the comments at the steps name the quantity stored:
       XS  : scratch X seen as an A-quarter   (S0, S1, S2, S3)
       XP  : scratch X seen as a dest-quarter (P0)
       YT  : scratch Y                        (T0, T1, T2, T3)
       D00 : upper left  quarter of dest      (P2, P3, P1, U0)
       D01 : upper right quarter of dest      (P5, U1, U3, U4)
       D10 : lower left  quarter of dest      (P6, U2, U5)
       D11 : lower right quarter of dest      (P4, U6)                      */
  MatrixWindow_t XS[1], XP[1], YT[1], D00[1], D01[1], D10[1], D11[1];
  MatrixToWindow(XS, X->Matrix, A_sub_nrows, A_sub_rowsize, X->ULCorner);
  MatrixToWindow(XP, X->Matrix, A_sub_nrows, B_sub_rowsize, X->ULCorner);
  MatrixToWindow(YT, Y->Matrix, B_sub_nrows, B_sub_rowsize, Y->ULCorner);
  MatrixToWindow(D00, dest, A_sub_nrows, B_sub_rowsize, dest00);
  MatrixToWindow(D01, dest, A_sub_nrows, B_sub_rowsize, dest01);
  MatrixToWindow(D10, dest, A_sub_nrows, B_sub_rowsize, dest10);
  MatrixToWindow(D11, dest, A_sub_nrows, B_sub_rowsize, dest11);

  /* Stays nonzero until the whole schedule has been carried out. */
  int failed = 1;

  /* No error checking for WindowSum()/WindowDif() below, as we know that
     the windows are compatible. */

  /* 1.  S2 = A00-A10 in X */
  WindowDif(XS, A00, A10);
  /* 2.  T2 = B11-B01 in Y */
  WindowDif(YT, B11, B01);
  /* 3.  P6 = S2*T2 in dest10 (dest is supposed to be empty) */
  if (StrassenStep(D10, XS, YT)) goto cleanup;

  /* 4.  S0 = A10+A11 in X */
  WindowSum(XS, A10, A11);
  /* 5.  T0 = B01-B00 in Y */
  WindowDif(YT, B01, B00);
  /* 6.  P4 = S0*T0 in dest11 (dest is supposed to be empty) */
  if (StrassenStep(D11, XS, YT)) goto cleanup;

  /* 7.  S1 = S0-A00 in X */
  WindowDif(XS, XS, A00);
  /* 8.  T1 = B11-T0 in Y */
  WindowDif(YT, B11, YT);
  /* 9.  P5 = S1*T1 in dest01 (dest is supposed to be empty) */
  if (StrassenStep(D01, XS, YT)) goto cleanup;

  /*10.  S3 = A01-S1 in X */
  WindowDif(XS, A01, XS);
  /*11.  P2 = S3*B11 in dest00 (that part of dest is still supposed to be empty) */
  if (StrassenStep(D00, XS, B11)) goto cleanup;

  /*12.  P0 = A00*B00 in X.
         This time, the window we write our product to may be non-empty.
         Hence, we clear the destination first. */
  WindowClear(XP);
  if (StrassenStep(XP, A00, B00)) goto cleanup;

  /*13.  U1 = P0+P5 in dest01 */
  WindowSum(D01, XP, D01);
  /*14.  U2 = U1+P6 in dest10 */
  WindowSum(D10, D01, D10);
  /*15.  U3 = U1+P4 in dest01 */
  WindowSum(D01, D01, D11);
  /*16.  U6 = U2+P4 in dest11 (final) */
  WindowSum(D11, D10, D11);
  /*17.  U4 = U3+P2 in dest01 (final) */
  WindowSum(D01, D01, D00);

  /*18.  T3 = T1-B10 in Y */
  WindowDif(YT, YT, B10);
  /*19.  P3 = A11*T3 in dest00. Meanwhile dest00 is non-empty. Hence, overwrite */
  WindowClear(D00);
  if (StrassenStep(D00, A11, YT)) goto cleanup;

  /*20.  U5 = U2-P3 in dest10 (final) */
  WindowDif(D10, D10, D00);

  /*21.  P1 = A01*B10 in dest00. Again, we need to overwrite */
  WindowClear(D00);
  if (StrassenStep(D00, A01, B10)) goto cleanup;

  /*22.  U0 = P0+P1 in dest00 (final) */
  WindowSum(D00, XP, D00);

  failed = 0;

cleanup:
  /* The scratch windows are released on success and on failure alike. */
  WindowFree(X);
  WindowFree(Y);
  if (failed)
    return 1;

  /* ---------------------------------------------------------
     Deal with the leftovers on the bottom and the right wing
     --------------------------------------------------------- */

  if (B_rowsize_rem)
    {
      MatrixToWindow(B_last_col, B, B_win->Nor, B_rowsize_rem, (PTR)((char*)(B_win->ULCorner) + B_sub_rowsize2b));
      MatrixToWindow(dest_last_col, dest, A_win->Nor, B_rowsize_rem, (PTR)((char*)(dest_win->ULCorner) + B_sub_rowsize2b));
      /* that part of dest is still supposed to be empty, so we can add the product */
      WindowAddMul(dest_last_col, A_win, B_last_col);
    }
  if (A_nrows_rem)
    {
      FfSetNoc(A->Noc);
      MatrixToWindow(A_last_row, A, A_nrows_rem, A_win->RowSize, (PTR)((char*)(A_win->ULCorner) + (A_sub_nrows2*FfCurrentRowSize)));
      if (B_rowsize_rem) /* We have already considered the lower right corner in the previous if-clause */
      {
          MatrixToWindow(B_bulk, B, B_win->Nor, B_sub_rowsize2, B_win->ULCorner);
          FfSetNoc(dest->Noc);
          MatrixToWindow(dest_last_row, dest, A_nrows_rem, B_sub_rowsize2, (PTR)((char*)(dest_win->ULCorner) + (A_sub_nrows2*FfCurrentRowSize)));
          /* that part of dest is still supposed to be empty, so we can add the product */
          WindowAddMul(dest_last_row, A_last_row, B_bulk);
      }
      else
      {
          FfSetNoc(dest->Noc);
          MatrixToWindow(dest_last_row, dest, A_nrows_rem, B_win->RowSize, (PTR)((char*)(dest_win->ULCorner) + (A_sub_nrows2*FfCurrentRowSize)));
          /* that part of dest is still supposed to be empty, so we can add the product */
          WindowAddMul(dest_last_row, A_last_row, B_win);
      }
    }
  if (A_rowsize_rem)
    { /* By the above operations, we don't need to consider the lower right corner of either A or B. */
      MatrixToWindow(A_last_col, A, A_sub_nrows2, A_rowsize_rem, (PTR)((char*)(A_win->ULCorner) + A_sub_rowsize2b));
      FfSetNoc(B->Noc);
      MatrixToWindow(B_last_row, B, B_nrows_rem, B_sub_rowsize2, (PTR)((char*)(B_win->ULCorner) + (B_sub_nrows2*FfCurrentRowSize)));
      FfSetNoc(dest->Noc);
      MatrixToWindow(dest_bulk, dest, A_sub_nrows2, B_sub_rowsize2, dest_win->ULCorner);
      /* now we are supposed to add the product to the result obtained so far */
      WindowAddMul(dest_bulk, A_last_col, B_last_row);
    }
  return 0;
}

/**
 ** @}
 **/


/**
 ** @addtogroup mat
 ** @{
 **/

/**
 ** Multiply matrices.
 ** This function multiplies @a A from the right by @a B and writes
 ** the result into @a dest.
 ** The matrices must be compatible for multiplication, i.e. they must be over
 ** the same field, and the number of columns of @a A must be equal to the
 ** number of rows of @a B.
 ** Moreover, @a dest must be allocated over the same field with as many rows
 ** as @a A and as many columns as @a B. These conditions are checked, and
 ** NULL is returned if any of them is violated, because proceeding would
 ** read or write outside the matrices.
 ** Since parts of @a dest are used to store temporary results, it is essential
 ** that @a dest initially is zero! (This is not checked.)
 **
 ** @see matwin
 ** @param[out] dest Result.
 ** @param A Left factor.
 ** @param B Right factor
 ** @return Either @a dest, or NULL on error.
 **/
Matrix_t *MatMulStrassen(Matrix_t *dest, const Matrix_t *A, const Matrix_t *B)
{
  if (A->Field != B->Field || A->Noc != B->Nor)
    {
      MTX_ERROR1("Matrices cannot be multiplied: %E", MTX_ERR_INCOMPAT);
      return NULL;
    }
  if (dest->Field != A->Field || dest->Nor != A->Nor || dest->Noc != B->Noc)
    {
      MTX_ERROR1("Destination does not fit the product: %E", MTX_ERR_INCOMPAT);
      return NULL;
    }

  FfSetField(A->Field);
  MatrixWindow_t A_win[1], B_win[1], dest_win[1];

  /* Each matrix is wrapped into a window covering all of it. LPR is read
     right after FfSetNoc(), i.e. it refers to the matrix just selected. */
  FfSetNoc(A->Noc);
  MatrixToWindow(A_win, A, A->Nor, LPR, A->Data);
  FfSetNoc(B->Noc);
  MatrixToWindow(B_win, B, B->Nor, LPR, B->Data);
  FfSetNoc(dest->Noc);
  MatrixToWindow(dest_win, dest, A->Nor, LPR, dest->Data);

  if (StrassenStep(dest_win, A_win, B_win)) return NULL;
  return dest;
}

/**
 ** @}
 **/
