/*
 * Copyright © 2018 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included
 * in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

/* blt command encoding for gen4/5 */
#include "crocus_context.h"

#include "crocus_genx_macros.h"
#include "crocus_genx_protos.h"
#include "crocus_resource.h"

#define FILE_DEBUG_FLAG DEBUG_BLIT

#if GFX_VER <= 5

/**
 * Map a bytes-per-pixel count to the blitter COLOR_DEPTH_* encoding.
 *
 * Only 4, 2 and 1 bytes per pixel are handled; any other value hits
 * unreachable().
 */
static uint32_t
color_depth_for_cpp(int cpp)
{
   if (cpp == 4)
      return COLOR_DEPTH__32bit;

   if (cpp == 2)
      return COLOR_DEPTH__565;

   if (cpp == 1)
      return COLOR_DEPTH__8bit;

   unreachable("not reached");
}

/**
 * Fill a rectangle of \p dst with a solid 0xffffffff pattern through
 * XY_COLOR_BLT, using a 32bpp byte mask of 2.
 *
 * NOTE(review): byte mask 2 presumably restricts the write to the alpha
 * byte (hence the function name) -- confirm against the genxml definition.
 *
 * \param batch   batch the blit commands are emitted into
 * \param dst     destination resource
 * \param x       left edge of the rectangle, in surface elements
 * \param y       top edge of the rectangle, in surface elements
 * \param width   rectangle width in elements; nothing is emitted if <= 0
 * \param height  rectangle height in elements; nothing is emitted if <= 0
 */
static void
blt_set_alpha_to_one(struct crocus_batch *batch,
                     struct crocus_resource *dst,
                     int x, int y, int width, int height)
{
   /* Nothing to fill.  This also keeps a negative size from being converted
    * into a huge unsigned loop bound below.
    */
   if (width <= 0 || height <= 0)
      return;

   const struct isl_format_layout *fmtl = isl_format_get_layout(dst->surf.format);
   unsigned cpp = fmtl->bpb / 8;
   uint32_t pitch = dst->surf.row_pitch_B;

   /* Tiled destinations take their pitch in dwords rather than bytes. */
   if (dst->surf.tiling != ISL_TILING_LINEAR)
      pitch /= 4;

   /* We need to split the blit into chunks that each fit within the blitter's
    * restrictions.  We can't use a chunk size of 32768 because we need to
    * ensure that tile_x + chunk_size fits.  We choose 16384 because it's
    * a nice round power of two, big enough that performance won't suffer, and
    * small enough to guarantee everything fits.
    */
   const uint32_t max_chunk_size = 16384;
   const uint32_t fill_w = width;
   const uint32_t fill_h = height;

   for (uint32_t chunk_x = 0; chunk_x < fill_w; chunk_x += max_chunk_size) {
      for (uint32_t chunk_y = 0; chunk_y < fill_h; chunk_y += max_chunk_size) {
         const uint32_t chunk_w = MIN2(max_chunk_size, fill_w - chunk_x);
         const uint32_t chunk_h = MIN2(max_chunk_size, fill_h - chunk_y);
         uint32_t tile_x, tile_y;
         uint64_t offset_B;
         ASSERTED uint32_t z_offset_el, array_offset;

         /* Split the chunk origin, offset by the caller's (x, y), into a
          * byte offset plus an intratile x/y remainder.
          */
         isl_tiling_get_intratile_offset_el(dst->surf.tiling, dst->surf.dim,
                                            dst->surf.msaa_layout,
                                            cpp * 8, dst->surf.samples,
                                            dst->surf.row_pitch_B,
                                            dst->surf.array_pitch_el_rows,
                                            x + chunk_x, y + chunk_y, 0, 0,
                                            &offset_B,
                                            &tile_x, &tile_y,
                                            &z_offset_el, &array_offset);
         assert(z_offset_el == 0);
         assert(array_offset == 0);

         crocus_emit_cmd(batch, GENX(XY_COLOR_BLT), xyblt) {
            xyblt.TilingEnable = dst->surf.tiling != ISL_TILING_LINEAR;
            xyblt.ColorDepth = color_depth_for_cpp(cpp);
            xyblt.RasterOperation = 0xF0;
            xyblt.DestinationPitch = pitch;
            xyblt._32bppByteMask = 2;
            xyblt.DestinationBaseAddress = rw_bo(dst->bo, offset_B);
            xyblt.DestinationX1Coordinate = tile_x;
            xyblt.DestinationY1Coordinate = tile_y;
            xyblt.DestinationX2Coordinate = tile_x + chunk_w;
            xyblt.DestinationY2Coordinate = tile_y + chunk_h;
            xyblt.SolidPatternColor = 0xffffffff;
         }
      }
   }
}

/**
 * Decide whether a pipe_blit_info request is simple enough to attempt with
 * the blitter.
 *
 * The request is rejected when:
 *  - source and destination boxes differ in width or height (scaling),
 *  - scissoring is enabled,
 *  - any box width or height is negative (mirrored box),
 *  - either box is more than one slice deep,
 *  - the source format has no non-void channel, or
 *  - the source has an implicit alpha of one, the destination has real
 *    alpha, and the source channels are wider than 8 bits.
 *
 * \param batch  unused here; kept for a uniform signature
 * \param info   blit description to inspect
 * \return true if the blit may be attempted with the blitter
 */
static bool validate_blit_for_blt(struct crocus_batch *batch,
                                  const struct pipe_blit_info *info)
{
   /* No scaling: both rectangles must have identical extents. */
   if (info->dst.box.width != info->src.box.width ||
       info->dst.box.height != info->src.box.height)
      return false;

   if (info->scissor_enable)
      return false;

   /* No mirroring in either axis.  A negative width must be rejected just
    * like a negative height: crocus_emit_blt() stores the box extents in
    * uint32_t, so a negative value would turn into a huge chunk loop bound.
    */
   if (info->dst.box.width < 0 || info->src.box.width < 0 ||
       info->dst.box.height < 0 || info->src.box.height < 0)
      return false;

   /* Only single-slice blits are handled. */
   if (info->dst.box.depth > 1 || info->src.box.depth > 1)
      return false;

   const struct util_format_description *desc =
      util_format_description(info->src.format);
   int i = util_format_get_first_non_void_channel(info->src.format);
   if (i == -1)
      return false;

   /* can't do the alpha to 1 setting for these. */
   if ((util_format_has_alpha1(info->src.format) &&
        util_format_has_alpha(info->dst.format) &&
        desc->channel[i].size > 8))
      return false;
   return true;
}

/**
 * Return the row pitch of \p res in the units used for the blitter pitch
 * fields in this file: bytes for linear surfaces, the byte pitch divided
 * by four for tiled ones.
 */
static inline int crocus_resource_blt_pitch(struct crocus_resource *res)
{
   const int pitch_B = res->surf.row_pitch_B;

   return res->surf.tiling == ISL_TILING_LINEAR ? pitch_B : pitch_B / 4;
}


/**
 * Emit one XY_SRC_COPY_BLT (raster op 0xCC) followed by an MI flush.
 *
 * Formats wider than 4 bytes per element are copied as 2- or 4-byte pixels
 * with the X coordinates scaled up accordingly.
 *
 * \param batch       batch to emit into
 * \param src         source resource
 * \param dst         destination resource
 * \param cpp         bytes per element of the copy
 * \param src_pitch   source row pitch in bytes
 * \param src_offset  byte offset into the source BO
 * \param dst_pitch   destination row pitch in bytes
 * \param dst_offset  byte offset into the destination BO
 * \param src_x,src_y source origin relative to src_offset
 * \param dst_x,dst_y destination origin relative to dst_offset
 * \param w,h         size of the copied rectangle
 * \return false, with nothing emitted, if a pitch is not dword-aligned or an
 *         offset is not aligned to the (possibly reduced) cpp; true otherwise
 */
static bool emit_copy_blt(struct crocus_batch *batch,
                          struct crocus_resource *src,
                          struct crocus_resource *dst,
                          unsigned cpp,
                          int32_t src_pitch,
                          unsigned src_offset,
                          int32_t dst_pitch,
                          unsigned dst_offset,
                          uint16_t src_x, uint16_t src_y,
                          uint16_t dst_x, uint16_t dst_y,
                          uint16_t w, uint16_t h)

{
   const bool src_tiled = src->surf.tiling != ISL_TILING_LINEAR;
   const bool dst_tiled = dst->surf.tiling != ISL_TILING_LINEAR;
   uint32_t src_tile_w, src_tile_h;
   uint32_t dst_tile_w, dst_tile_h;
   int dst_x2 = dst_x + w;
   int dst_y2 = dst_y + h;

   DBG("%s src:buf(%p)/%d+%d %d,%d dst:buf(%p)/%d+%d %d,%d sz:%dx%d\n",
       __func__,
       src, src_pitch, src_offset, src_x, src_y,
       dst, dst_pitch, dst_offset, dst_x, dst_y, w, h);

   isl_get_tile_dims(src->surf.tiling, cpp, &src_tile_w, &src_tile_h);
   isl_get_tile_dims(dst->surf.tiling, cpp, &dst_tile_w, &dst_tile_h);

   /* A tiled surface's pitch must be a whole number of tile widths; the
    * buffer allocation is expected to have guaranteed that already.
    */
   assert(!src_tiled || (src_pitch % src_tile_w) == 0);
   assert(!dst_tiled || (dst_pitch % dst_tile_w) == 0);

   /* Wide formats (such as floating point) are copied as 16bpp or 32bpp
    * pixels, stretching the horizontal coordinates to compensate.
    */
   if (cpp > 4) {
      const unsigned blt_cpp = (cpp % 4 == 2) ? 2 : 4;
      assert(cpp % blt_cpp == 0);

      const unsigned scale = cpp / blt_cpp;
      dst_x *= scale;
      dst_x2 *= scale;
      src_x *= scale;
      cpp = blt_cpp;
   }

   /* The blit pitch has to be dword-aligned (the hardware appears to drop
    * the low bits otherwise) and offsets have to be naturally aligned.
    */
   const bool aligned = src_pitch % 4 == 0 && src_offset % cpp == 0 &&
                        dst_pitch % 4 == 0 && dst_offset % cpp == 0;
   if (!aligned)
      return false;

   /* Tiled surfaces specify their pitch in dwords. */
   if (src_tiled)
      src_pitch /= 4;

   if (dst_tiled)
      dst_pitch /= 4;

   assert(cpp <= 4);
   crocus_emit_cmd(batch, GENX(XY_SRC_COPY_BLT), xyblt) {
      xyblt.RasterOperation = 0xCC;
      xyblt.ColorDepth = color_depth_for_cpp(cpp);
      xyblt._32bppByteMask = cpp == 4 ? 0x3 : 0x1;

      xyblt.SourceTilingEnable = src_tiled;
      xyblt.SourceBaseAddress = ro_bo(src->bo, src_offset);
      xyblt.SourcePitch = src_pitch;
      xyblt.SourceX1Coordinate = src_x;
      xyblt.SourceY1Coordinate = src_y;

      xyblt.DestinationTilingEnable = dst_tiled;
      xyblt.DestinationBaseAddress = rw_bo(dst->bo, dst_offset);
      xyblt.DestinationPitch = dst_pitch;
      xyblt.DestinationX1Coordinate = dst_x;
      xyblt.DestinationY1Coordinate = dst_y;
      xyblt.DestinationX2Coordinate = dst_x2;
      xyblt.DestinationY2Coordinate = dst_y2;
   }

   crocus_emit_mi_flush(batch);
   return true;
}

static bool crocus_emit_blt(struct crocus_batch *batch,
                            struct crocus_resource *src,
                            struct crocus_resource *dst,
                            unsigned dst_level,
                            unsigned dst_x, unsigned dst_y,
                            unsigned dst_z,
                            unsigned src_level,
                            const struct pipe_box *src_box)
{
   const struct isl_format_layout *src_fmtl = isl_format_get_layout(src->surf.format);
   unsigned src_cpp = src_fmtl->bpb / 8;
   const struct isl_format_layout *dst_fmtl = isl_format_get_layout(dst->surf.format);
   const unsigned dst_cpp = dst_fmtl->bpb / 8;
   uint16_t src_x, src_y;
   uint32_t src_image_x, src_image_y, dst_image_x, dst_image_y;
   uint32_t src_width = src_box->width, src_height = src_box->height;

   /* gen4/5 can't handle Y tiled blits. */
   if (src->surf.tiling == ISL_TILING_Y0 || dst->surf.tiling == ISL_TILING_Y0)
      return false;

   if (src->surf.format != dst->surf.format)
      return false;

   if (src_cpp != dst_cpp)
      return false;

   src_x = src_box->x;
   src_y = src_box->y;

   assert(src_cpp == dst_cpp);

   crocus_resource_get_image_offset(src, src_level, src_box->z, &src_image_x,
                                    &src_image_y);
   if (util_format_is_compressed(src->base.b.format)) {
      int bw = util_format_get_blockwidth(src->base.b.format);
      int bh = util_format_get_blockheight(src->base.b.format);
      assert(src_x % bw == 0);
      assert(src_y % bh == 0);
      src_x /= (int)bw;
      src_y /= (int)bh;
      src_width = DIV_ROUND_UP(src_width, (int)bw);
      src_height = DIV_ROUND_UP(src_height, (int)bh);
   }

   crocus_resource_get_image_offset(dst, dst_level, dst_z, &dst_image_x,
                                    &dst_image_y);
   if (util_format_is_compressed(dst->base.b.format)) {
      int bw = util_format_get_blockwidth(dst->base.b.format);
      int bh = util_format_get_blockheight(dst->base.b.format);
      assert(dst_x % bw == 0);
      assert(dst_y % bh == 0);
      dst_x /= (int)bw;
      dst_y /= (int)bh;
   }
   src_x += src_image_x;
   src_y += src_image_y;
   dst_x += dst_image_x;
   dst_y += dst_image_y;

   /* According to the Ivy Bridge PRM, Vol1 Part4, section 1.2.1.2 (Graphics
    * Data Size Limitations):
    *
    *    The BLT engine is capable of transferring very large quantities of
    *    graphics data. Any graphics data read from and written to the
    *    destination is permitted to represent a number of pixels that
    *    occupies up to 65,536 scan lines and up to 32,768 bytes per scan line
    *    at the destination. The maximum number of pixels that may be
    *    represented per scan line’s worth of graphics data depends on the
    *    color depth.
    *
    * The blitter's pitch is a signed 16-bit integer, but measured in bytes
    * for linear surfaces and DWords for tiled surfaces.  So the maximum
    * pitch is 32k linear and 128k tiled.
    */
   if (crocus_resource_blt_pitch(src) >= 32768 ||
       crocus_resource_blt_pitch(dst) >= 32768) {
      return false;
   }

   /* We need to split the blit into chunks that each fit within the blitter's
    * restrictions.  We can't use a chunk size of 32768 because we need to
    * ensure that src_tile_x + chunk_size fits.  We choose 16384 because it's
    * a nice round power of two, big enough that performance won't suffer, and
    * small enough to guarantee everything fits.
    */
   const uint32_t max_chunk_size = 16384;

   for (uint32_t chunk_x = 0; chunk_x < src_width; chunk_x += max_chunk_size) {
      for (uint32_t chunk_y = 0; chunk_y < src_height; chunk_y += max_chunk_size) {
         const uint32_t chunk_w = MIN2(max_chunk_size, src_width - chunk_x);
         const uint32_t chunk_h = MIN2(max_chunk_size, src_height - chunk_y);

         uint64_t src_offset;
         uint32_t src_tile_x, src_tile_y;
         ASSERTED uint32_t z_offset_el, array_offset;
         isl_tiling_get_intratile_offset_el(src->surf.tiling, src->surf.dim,
                                            src->surf.msaa_layout,
                                            src_cpp * 8, src->surf.samples,
                                            src->surf.row_pitch_B,
                                            src->surf.array_pitch_el_rows,
                                            src_x + chunk_x, src_y + chunk_y, 0, 0,
                                            &src_offset,
                                            &src_tile_x, &src_tile_y,
                                            &z_offset_el, &array_offset);
         assert(z_offset_el == 0);
         assert(array_offset == 0);

         uint64_t dst_offset;
         uint32_t dst_tile_x, dst_tile_y;
         isl_tiling_get_intratile_offset_el(dst->surf.tiling, dst->surf.dim,
                                            dst->surf.msaa_layout,
                                            dst_cpp * 8, dst->surf.samples,
                                            dst->surf.row_pitch_B,
                                            dst->surf.array_pitch_el_rows,
                                            dst_x + chunk_x, dst_y + chunk_y, 0, 0,
                                            &dst_offset,
                                            &dst_tile_x, &dst_tile_y,
                                            &z_offset_el, &array_offset);
         assert(z_offset_el == 0);
         assert(array_offset == 0);
         if (!emit_copy_blt(batch, src, dst,
                            src_cpp, src->surf.row_pitch_B,
                            src_offset,
                            dst->surf.row_pitch_B, dst_offset,
                            src_tile_x, src_tile_y,
                            dst_tile_x, dst_tile_y,
                            chunk_w, chunk_h)) {
            return false;
         }
      }
   }

   if (util_format_has_alpha1(src->base.b.format) &&
       util_format_has_alpha(dst->base.b.format))
      blt_set_alpha_to_one(batch, dst, 0, 0, src_width, src_height);
   return true;
}

/**
 * Blitter implementation of the blit hook.
 *
 * Runs validate_blit_for_blt() on \p info and, if it passes, forwards the
 * source/destination resources, levels and box origins to crocus_emit_blt().
 *
 * \return false if validation fails, otherwise crocus_emit_blt()'s result
 */
static bool crocus_blit_blt(struct crocus_batch *batch,
                            const struct pipe_blit_info *info)
{
   if (validate_blit_for_blt(batch, info)) {
      struct crocus_resource *src_res =
         (struct crocus_resource *)info->src.resource;
      struct crocus_resource *dst_res =
         (struct crocus_resource *)info->dst.resource;

      return crocus_emit_blt(batch, src_res, dst_res,
                             info->dst.level,
                             info->dst.box.x, info->dst.box.y, info->dst.box.z,
                             info->src.level,
                             &info->src.box);
   }

   return false;
}


/**
 * Blitter implementation of the copy-region hook.
 *
 * Declines (returns false) if either resource is a PIPE_BUFFER; otherwise
 * the request is handed to crocus_emit_blt() unchanged.
 */
static bool crocus_copy_region_blt(struct crocus_batch *batch,
                                   struct crocus_resource *dst,
                                   unsigned dst_level,
                                   unsigned dstx, unsigned dsty, unsigned dstz,
                                   struct crocus_resource *src,
                                   unsigned src_level,
                                   const struct pipe_box *src_box)
{
   const bool involves_buffer = dst->base.b.target == PIPE_BUFFER ||
                                src->base.b.target == PIPE_BUFFER;

   if (involves_buffer)
      return false;

   return crocus_emit_blt(batch, src, dst, dst_level,
                          dstx, dsty, dstz,
                          src_level, src_box);
}
#endif

/**
 * Install the blitter hooks in the screen vtable.
 *
 * Builds with GFX_VER <= 5 get the blitter implementations from this file;
 * all other builds set both hooks to NULL.
 */
void
genX(crocus_init_blt)(struct crocus_screen *screen)
{
#if GFX_VER > 5
   screen->vtbl.blit_blt = NULL;
   screen->vtbl.copy_region_blt = NULL;
#else
   screen->vtbl.blit_blt = crocus_blit_blt;
   screen->vtbl.copy_region_blt = crocus_copy_region_blt;
#endif
}
