/*
 * Copyright © 2006 - 2017 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "brw_compiler.h"
#include "brw_eu.h"

#include "dev/gen_debug.h"

struct brw_sf_compile {
   struct brw_codegen func;
   struct brw_sf_prog_key key;
   struct brw_sf_prog_data prog_data;

   struct brw_reg pv;
   struct brw_reg det;
   struct brw_reg dx0;
   struct brw_reg dx2;
   struct brw_reg dy0;
   struct brw_reg dy2;

   /* z and 1/w passed in seperately:
    */
   struct brw_reg z[3];
   struct brw_reg inv_w[3];

   /* The vertices:
    */
   struct brw_reg vert[3];

    /* Temporaries, allocated after last vertex reg.
    */
   struct brw_reg inv_det;
   struct brw_reg a1_sub_a0;
   struct brw_reg a2_sub_a0;
   struct brw_reg tmp;

   struct brw_reg m1Cx;
   struct brw_reg m2Cy;
   struct brw_reg m3C0;

   GLuint nr_verts;
   GLuint nr_attr_regs;
   GLuint nr_setup_regs;
   int urb_entry_read_offset;

   /** The last known value of the f0.0 flag register. */
   unsigned flag_value;

   struct brw_vue_map vue_map;
};

/**
 * Determine the vue slot corresponding to the given half of the given register.
 */
static inline int vert_reg_to_vue_slot(struct brw_sf_compile *c, GLuint reg,
                                       int half)
{
   return (reg + c->urb_entry_read_offset) * 2 + half;
}

/**
 * Determine the varying corresponding to the given half of the given
 * register.  half=0 means the first half of a register, half=1 means the
 * second half.
 */
static inline int vert_reg_to_varying(struct brw_sf_compile *c, GLuint reg,
                                      int half)
{
   int vue_slot = vert_reg_to_vue_slot(c, reg, half);
   return c->vue_map.slot_to_varying[vue_slot];
}

/**
 * Determine the register corresponding to the given vue slot
 */
static struct brw_reg get_vue_slot(struct brw_sf_compile *c,
                                   struct brw_reg vert,
                                   int vue_slot)
{
   GLuint off = vue_slot / 2 - c->urb_entry_read_offset;
   GLuint sub = vue_slot % 2;

   return brw_vec4_grf(vert.nr + off, sub * 4);
}

/**
 * Determine the register corresponding to the given varying.
 */
static struct brw_reg get_varying(struct brw_sf_compile *c,
                                  struct brw_reg vert,
                                  GLuint varying)
{
   int vue_slot = c->vue_map.varying_to_slot[varying];
   assert (vue_slot >= c->urb_entry_read_offset);
   return get_vue_slot(c, vert, vue_slot);
}

static bool
have_attr(struct brw_sf_compile *c, GLuint attr)
{
   return (c->key.attrs & BITFIELD64_BIT(attr)) ? 1 : 0;
}

/***********************************************************************
 * Twoside lighting
 */
static void copy_bfc( struct brw_sf_compile *c,
		      struct brw_reg vert )
{
   struct brw_codegen *p = &c->func;
   GLuint i;

   for (i = 0; i < 2; i++) {
      if (have_attr(c, VARYING_SLOT_COL0+i) &&
	  have_attr(c, VARYING_SLOT_BFC0+i))
	 brw_MOV(p,
		 get_varying(c, vert, VARYING_SLOT_COL0+i),
		 get_varying(c, vert, VARYING_SLOT_BFC0+i));
   }
}


static void do_twoside_color( struct brw_sf_compile *c )
{
   struct brw_codegen *p = &c->func;
   GLuint backface_conditional = c->key.frontface_ccw ? BRW_CONDITIONAL_G : BRW_CONDITIONAL_L;

   /* Already done in clip program:
    */
   if (c->key.primitive == BRW_SF_PRIM_UNFILLED_TRIS)
      return;

   /* If the vertex shader provides backface color, do the selection. The VS
    * promises to set up the front color if the backface color is provided, but
    * it may contain junk if never written to.
    */
   if (!(have_attr(c, VARYING_SLOT_COL0) && have_attr(c, VARYING_SLOT_BFC0)) &&
       !(have_attr(c, VARYING_SLOT_COL1) && have_attr(c, VARYING_SLOT_BFC1)))
      return;

   /* Need to use BRW_EXECUTE_4 and also do an 4-wide compare in order
    * to get all channels active inside the IF.  In the clipping code
    * we run with NoMask, so it's not an option and we can use
    * BRW_EXECUTE_1 for all comparisions.
    */
   brw_CMP(p, vec4(brw_null_reg()), backface_conditional, c->det, brw_imm_f(0));
   brw_IF(p, BRW_EXECUTE_4);
   {
      switch (c->nr_verts) {
      case 3: copy_bfc(c, c->vert[2]);
      case 2: copy_bfc(c, c->vert[1]);
      case 1: copy_bfc(c, c->vert[0]);
      }
   }
   brw_ENDIF(p);
}



/***********************************************************************
 * Flat shading
 */

static void copy_flatshaded_attributes(struct brw_sf_compile *c,
                                       struct brw_reg dst,
                                       struct brw_reg src)
{
   struct brw_codegen *p = &c->func;
   int i;

   for (i = 0; i < c->vue_map.num_slots; i++) {
      if (c->key.interp_mode[i] == INTERP_MODE_FLAT) {
         brw_MOV(p,
                 get_vue_slot(c, dst, i),
                 get_vue_slot(c, src, i));
      }
   }
}

static int count_flatshaded_attributes(struct brw_sf_compile *c)
{
   int i;
   int count = 0;

   for (i = 0; i < c->vue_map.num_slots; i++)
      if (c->key.interp_mode[i] == INTERP_MODE_FLAT)
         count++;

   return count;
}



/* Need to use a computed jump to copy flatshaded attributes as the
 * vertices are ordered according to y-coordinate before reaching this
 * point, so the PV could be anywhere.
 */
static void do_flatshade_triangle( struct brw_sf_compile *c )
{
   struct brw_codegen *p = &c->func;
   GLuint nr;
   GLuint jmpi = 1;

   /* Already done in clip program:
    */
   if (c->key.primitive == BRW_SF_PRIM_UNFILLED_TRIS)
      return;

   if (p->devinfo->gen == 5)
       jmpi = 2;

   nr = count_flatshaded_attributes(c);

   brw_MUL(p, c->pv, c->pv, brw_imm_d(jmpi*(nr*2+1)));
   brw_JMPI(p, c->pv, BRW_PREDICATE_NONE);

   copy_flatshaded_attributes(c, c->vert[1], c->vert[0]);
   copy_flatshaded_attributes(c, c->vert[2], c->vert[0]);
   brw_JMPI(p, brw_imm_d(jmpi*(nr*4+1)), BRW_PREDICATE_NONE);

   copy_flatshaded_attributes(c, c->vert[0], c->vert[1]);
   copy_flatshaded_attributes(c, c->vert[2], c->vert[1]);
   brw_JMPI(p, brw_imm_d(jmpi*nr*2), BRW_PREDICATE_NONE);

   copy_flatshaded_attributes(c, c->vert[0], c->vert[2]);
   copy_flatshaded_attributes(c, c->vert[1], c->vert[2]);
}


static void do_flatshade_line( struct brw_sf_compile *c )
{
   struct brw_codegen *p = &c->func;
   GLuint nr;
   GLuint jmpi = 1;

   /* Already done in clip program:
    */
   if (c->key.primitive == BRW_SF_PRIM_UNFILLED_TRIS)
      return;

   if (p->devinfo->gen == 5)
       jmpi = 2;

   nr = count_flatshaded_attributes(c);

   brw_MUL(p, c->pv, c->pv, brw_imm_d(jmpi*(nr+1)));
   brw_JMPI(p, c->pv, BRW_PREDICATE_NONE);
   copy_flatshaded_attributes(c, c->vert[1], c->vert[0]);

   brw_JMPI(p, brw_imm_ud(jmpi*nr), BRW_PREDICATE_NONE);
   copy_flatshaded_attributes(c, c->vert[0], c->vert[1]);
}


/***********************************************************************
 * Triangle setup.
 */


static void alloc_regs( struct brw_sf_compile *c )
{
   GLuint reg, i;

   /* Values computed by fixed function unit:
    */
   c->pv  = retype(brw_vec1_grf(1, 1), BRW_REGISTER_TYPE_D);
   c->det = brw_vec1_grf(1, 2);
   c->dx0 = brw_vec1_grf(1, 3);
   c->dx2 = brw_vec1_grf(1, 4);
   c->dy0 = brw_vec1_grf(1, 5);
   c->dy2 = brw_vec1_grf(1, 6);

   /* z and 1/w passed in seperately:
    */
   c->z[0]     = brw_vec1_grf(2, 0);
   c->inv_w[0] = brw_vec1_grf(2, 1);
   c->z[1]     = brw_vec1_grf(2, 2);
   c->inv_w[1] = brw_vec1_grf(2, 3);
   c->z[2]     = brw_vec1_grf(2, 4);
   c->inv_w[2] = brw_vec1_grf(2, 5);

   /* The vertices:
    */
   reg = 3;
   for (i = 0; i < c->nr_verts; i++) {
      c->vert[i] = brw_vec8_grf(reg, 0);
      reg += c->nr_attr_regs;
   }

   /* Temporaries, allocated after last vertex reg.
    */
   c->inv_det = brw_vec1_grf(reg, 0);  reg++;
   c->a1_sub_a0 = brw_vec8_grf(reg, 0);  reg++;
   c->a2_sub_a0 = brw_vec8_grf(reg, 0);  reg++;
   c->tmp = brw_vec8_grf(reg, 0);  reg++;

   /* Note grf allocation:
    */
   c->prog_data.total_grf = reg;


   /* Outputs of this program - interpolation coefficients for
    * rasterization:
    */
   c->m1Cx = brw_vec8_reg(BRW_MESSAGE_REGISTER_FILE, 1, 0);
   c->m2Cy = brw_vec8_reg(BRW_MESSAGE_REGISTER_FILE, 2, 0);
   c->m3C0 = brw_vec8_reg(BRW_MESSAGE_REGISTER_FILE, 3, 0);
}


static void copy_z_inv_w( struct brw_sf_compile *c )
{
   struct brw_codegen *p = &c->func;
   GLuint i;

   /* Copy both scalars with a single MOV:
    */
   for (i = 0; i < c->nr_verts; i++)
      brw_MOV(p, vec2(suboffset(c->vert[i], 2)), vec2(c->z[i]));
}


static void invert_det( struct brw_sf_compile *c)
{
   /* Looks like we invert all 8 elements just to get 1/det in
    * position 2 !?!
    */
   gen4_math(&c->func,
	     c->inv_det,
	     BRW_MATH_FUNCTION_INV,
	     0,
	     c->det,
	     BRW_MATH_PRECISION_FULL);

}


static bool
calculate_masks(struct brw_sf_compile *c,
                GLuint reg,
                GLushort *pc,
                GLushort *pc_persp,
                GLushort *pc_linear)
{
   bool is_last_attr = (reg == c->nr_setup_regs - 1);
   enum glsl_interp_mode interp;

   *pc_persp = 0;
   *pc_linear = 0;
   *pc = 0xf;

   interp = c->key.interp_mode[vert_reg_to_vue_slot(c, reg, 0)];
   if (interp == INTERP_MODE_SMOOTH) {
      *pc_linear = 0xf;
      *pc_persp = 0xf;
   } else if (interp == INTERP_MODE_NOPERSPECTIVE)
      *pc_linear = 0xf;

   /* Maybe only processs one attribute on the final round:
    */
   if (vert_reg_to_varying(c, reg, 1) != BRW_VARYING_SLOT_COUNT) {
      *pc |= 0xf0;

      interp = c->key.interp_mode[vert_reg_to_vue_slot(c, reg, 1)];
      if (interp == INTERP_MODE_SMOOTH) {
         *pc_linear |= 0xf0;
         *pc_persp |= 0xf0;
      } else if (interp == INTERP_MODE_NOPERSPECTIVE)
         *pc_linear |= 0xf0;
   }

   return is_last_attr;
}

/* Calculates the predicate control for which channels of a reg
 * (containing 2 attrs) to do point sprite coordinate replacement on.
 */
static uint16_t
calculate_point_sprite_mask(struct brw_sf_compile *c, GLuint reg)
{
   int varying1, varying2;
   uint16_t pc = 0;

   varying1 = vert_reg_to_varying(c, reg, 0);
   if (varying1 >= VARYING_SLOT_TEX0 && varying1 <= VARYING_SLOT_TEX7) {
      if (c->key.point_sprite_coord_replace & (1 << (varying1 - VARYING_SLOT_TEX0)))
	 pc |= 0x0f;
   }
   if (varying1 == BRW_VARYING_SLOT_PNTC)
      pc |= 0x0f;

   varying2 = vert_reg_to_varying(c, reg, 1);
   if (varying2 >= VARYING_SLOT_TEX0 && varying2 <= VARYING_SLOT_TEX7) {
      if (c->key.point_sprite_coord_replace & (1 << (varying2 -
                                                     VARYING_SLOT_TEX0)))
         pc |= 0xf0;
   }
   if (varying2 == BRW_VARYING_SLOT_PNTC)
      pc |= 0xf0;

   return pc;
}

static void
set_predicate_control_flag_value(struct brw_codegen *p,
                                 struct brw_sf_compile *c,
                                 unsigned value)
{
   brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);

   if (value != 0xff) {
      if (value != c->flag_value) {
         brw_MOV(p, brw_flag_reg(0, 0), brw_imm_uw(value));
         c->flag_value = value;
      }

      brw_set_default_predicate_control(p, BRW_PREDICATE_NORMAL);
   }
}

static void brw_emit_tri_setup(struct brw_sf_compile *c, bool allocate)
{
   struct brw_codegen *p = &c->func;
   GLuint i;

   c->flag_value = 0xff;
   c->nr_verts = 3;

   if (allocate)
      alloc_regs(c);

   invert_det(c);
   copy_z_inv_w(c);

   if (c->key.do_twoside_color)
      do_twoside_color(c);

   if (c->key.contains_flat_varying)
      do_flatshade_triangle(c);


   for (i = 0; i < c->nr_setup_regs; i++)
   {
      /* Pair of incoming attributes:
       */
      struct brw_reg a0 = offset(c->vert[0], i);
      struct brw_reg a1 = offset(c->vert[1], i);
      struct brw_reg a2 = offset(c->vert[2], i);
      GLushort pc, pc_persp, pc_linear;
      bool last = calculate_masks(c, i, &pc, &pc_persp, &pc_linear);

      if (pc_persp)
      {
	 set_predicate_control_flag_value(p, c, pc_persp);
	 brw_MUL(p, a0, a0, c->inv_w[0]);
	 brw_MUL(p, a1, a1, c->inv_w[1]);
	 brw_MUL(p, a2, a2, c->inv_w[2]);
      }


      /* Calculate coefficients for interpolated values:
       */
      if (pc_linear)
      {
	 set_predicate_control_flag_value(p, c, pc_linear);

	 brw_ADD(p, c->a1_sub_a0, a1, negate(a0));
	 brw_ADD(p, c->a2_sub_a0, a2, negate(a0));

	 /* calculate dA/dx
	  */
	 brw_MUL(p, brw_null_reg(), c->a1_sub_a0, c->dy2);
	 brw_MAC(p, c->tmp, c->a2_sub_a0, negate(c->dy0));
	 brw_MUL(p, c->m1Cx, c->tmp, c->inv_det);

	 /* calculate dA/dy
	  */
	 brw_MUL(p, brw_null_reg(), c->a2_sub_a0, c->dx0);
	 brw_MAC(p, c->tmp, c->a1_sub_a0, negate(c->dx2));
	 brw_MUL(p, c->m2Cy, c->tmp, c->inv_det);
      }

      {
	 set_predicate_control_flag_value(p, c, pc);
	 /* start point for interpolation
	  */
	 brw_MOV(p, c->m3C0, a0);

	 /* Copy m0..m3 to URB.  m0 is implicitly copied from r0 in
	  * the send instruction:
	  */
	 brw_urb_WRITE(p,
		       brw_null_reg(),
		       0,
		       brw_vec8_grf(0, 0), /* r0, will be copied to m0 */
                       last ? BRW_URB_WRITE_EOT_COMPLETE
                       : BRW_URB_WRITE_NO_FLAGS,
		       4, 	/* msg len */
		       0,	/* response len */
		       i*4,	/* offset */
		       BRW_URB_SWIZZLE_TRANSPOSE); /* XXX: Swizzle control "SF to windower" */
      }
   }

   brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
}



static void brw_emit_line_setup(struct brw_sf_compile *c, bool allocate)
{
   struct brw_codegen *p = &c->func;
   GLuint i;

   c->flag_value = 0xff;
   c->nr_verts = 2;

   if (allocate)
      alloc_regs(c);

   invert_det(c);
   copy_z_inv_w(c);

   if (c->key.contains_flat_varying)
      do_flatshade_line(c);

   for (i = 0; i < c->nr_setup_regs; i++)
   {
      /* Pair of incoming attributes:
       */
      struct brw_reg a0 = offset(c->vert[0], i);
      struct brw_reg a1 = offset(c->vert[1], i);
      GLushort pc, pc_persp, pc_linear;
      bool last = calculate_masks(c, i, &pc, &pc_persp, &pc_linear);

      if (pc_persp)
      {
	 set_predicate_control_flag_value(p, c, pc_persp);
	 brw_MUL(p, a0, a0, c->inv_w[0]);
	 brw_MUL(p, a1, a1, c->inv_w[1]);
      }

      /* Calculate coefficients for position, color:
       */
      if (pc_linear) {
	 set_predicate_control_flag_value(p, c, pc_linear);

	 brw_ADD(p, c->a1_sub_a0, a1, negate(a0));

	 brw_MUL(p, c->tmp, c->a1_sub_a0, c->dx0);
	 brw_MUL(p, c->m1Cx, c->tmp, c->inv_det);

	 brw_MUL(p, c->tmp, c->a1_sub_a0, c->dy0);
	 brw_MUL(p, c->m2Cy, c->tmp, c->inv_det);
      }

      {
	 set_predicate_control_flag_value(p, c, pc);

	 /* start point for interpolation
	  */
	 brw_MOV(p, c->m3C0, a0);

	 /* Copy m0..m3 to URB.
	  */
	 brw_urb_WRITE(p,
		       brw_null_reg(),
		       0,
		       brw_vec8_grf(0, 0),
                       last ? BRW_URB_WRITE_EOT_COMPLETE
                       : BRW_URB_WRITE_NO_FLAGS,
		       4, 	/* msg len */
		       0,	/* response len */
		       i*4,	/* urb destination offset */
		       BRW_URB_SWIZZLE_TRANSPOSE);
      }
   }

   brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
}

static void brw_emit_point_sprite_setup(struct brw_sf_compile *c, bool allocate)
{
   struct brw_codegen *p = &c->func;
   GLuint i;

   c->flag_value = 0xff;
   c->nr_verts = 1;

   if (allocate)
      alloc_regs(c);

   copy_z_inv_w(c);
   for (i = 0; i < c->nr_setup_regs; i++)
   {
      struct brw_reg a0 = offset(c->vert[0], i);
      GLushort pc, pc_persp, pc_linear, pc_coord_replace;
      bool last = calculate_masks(c, i, &pc, &pc_persp, &pc_linear);

      pc_coord_replace = calculate_point_sprite_mask(c, i);
      pc_persp &= ~pc_coord_replace;

      if (pc_persp) {
	 set_predicate_control_flag_value(p, c, pc_persp);
	 brw_MUL(p, a0, a0, c->inv_w[0]);
      }

      /* Point sprite coordinate replacement: A texcoord with this
       * enabled gets replaced with the value (x, y, 0, 1) where x and
       * y vary from 0 to 1 across the horizontal and vertical of the
       * point.
       */
      if (pc_coord_replace) {
	 set_predicate_control_flag_value(p, c, pc_coord_replace);
	 /* Caculate 1.0/PointWidth */
	 gen4_math(&c->func,
		   c->tmp,
		   BRW_MATH_FUNCTION_INV,
		   0,
		   c->dx0,
		   BRW_MATH_PRECISION_FULL);

	 brw_set_default_access_mode(p, BRW_ALIGN_16);

	 /* dA/dx, dA/dy */
	 brw_MOV(p, c->m1Cx, brw_imm_f(0.0));
	 brw_MOV(p, c->m2Cy, brw_imm_f(0.0));
	 brw_MOV(p, brw_writemask(c->m1Cx, WRITEMASK_X), c->tmp);
	 if (c->key.sprite_origin_lower_left) {
	    brw_MOV(p, brw_writemask(c->m2Cy, WRITEMASK_Y), negate(c->tmp));
	 } else {
	    brw_MOV(p, brw_writemask(c->m2Cy, WRITEMASK_Y), c->tmp);
	 }

	 /* attribute constant offset */
	 brw_MOV(p, c->m3C0, brw_imm_f(0.0));
	 if (c->key.sprite_origin_lower_left) {
	    brw_MOV(p, brw_writemask(c->m3C0, WRITEMASK_YW), brw_imm_f(1.0));
	 } else {
	    brw_MOV(p, brw_writemask(c->m3C0, WRITEMASK_W), brw_imm_f(1.0));
	 }

	 brw_set_default_access_mode(p, BRW_ALIGN_1);
      }

      if (pc & ~pc_coord_replace) {
	 set_predicate_control_flag_value(p, c, pc & ~pc_coord_replace);
	 brw_MOV(p, c->m1Cx, brw_imm_ud(0));
	 brw_MOV(p, c->m2Cy, brw_imm_ud(0));
	 brw_MOV(p, c->m3C0, a0); /* constant value */
      }


      set_predicate_control_flag_value(p, c, pc);
      /* Copy m0..m3 to URB. */
      brw_urb_WRITE(p,
		    brw_null_reg(),
		    0,
		    brw_vec8_grf(0, 0),
                    last ? BRW_URB_WRITE_EOT_COMPLETE
                    : BRW_URB_WRITE_NO_FLAGS,
		    4, 	/* msg len */
		    0,	/* response len */
		    i*4,	/* urb destination offset */
		    BRW_URB_SWIZZLE_TRANSPOSE);
   }

   brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
}

/* Points setup - several simplifications as all attributes are
 * constant across the face of the point (point sprites excluded!)
 */
static void brw_emit_point_setup(struct brw_sf_compile *c, bool allocate)
{
   struct brw_codegen *p = &c->func;
   GLuint i;

   c->flag_value = 0xff;
   c->nr_verts = 1;

   if (allocate)
      alloc_regs(c);

   copy_z_inv_w(c);

   brw_MOV(p, c->m1Cx, brw_imm_ud(0)); /* zero - move out of loop */
   brw_MOV(p, c->m2Cy, brw_imm_ud(0)); /* zero - move out of loop */

   for (i = 0; i < c->nr_setup_regs; i++)
   {
      struct brw_reg a0 = offset(c->vert[0], i);
      GLushort pc, pc_persp, pc_linear;
      bool last = calculate_masks(c, i, &pc, &pc_persp, &pc_linear);

      if (pc_persp)
      {
	 /* This seems odd as the values are all constant, but the
	  * fragment shader will be expecting it:
	  */
	 set_predicate_control_flag_value(p, c, pc_persp);
	 brw_MUL(p, a0, a0, c->inv_w[0]);
      }


      /* The delta values are always zero, just send the starting
       * coordinate.  Again, this is to fit in with the interpolation
       * code in the fragment shader.
       */
      {
	 set_predicate_control_flag_value(p, c, pc);

	 brw_MOV(p, c->m3C0, a0); /* constant value */

	 /* Copy m0..m3 to URB.
	  */
	 brw_urb_WRITE(p,
		       brw_null_reg(),
		       0,
		       brw_vec8_grf(0, 0),
                       last ? BRW_URB_WRITE_EOT_COMPLETE
                       : BRW_URB_WRITE_NO_FLAGS,
		       4, 	/* msg len */
		       0,	/* response len */
		       i*4,	/* urb destination offset */
		       BRW_URB_SWIZZLE_TRANSPOSE);
      }
   }

   brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
}

static void brw_emit_anyprim_setup( struct brw_sf_compile *c )
{
   struct brw_codegen *p = &c->func;
   struct brw_reg payload_prim = brw_uw1_reg(BRW_GENERAL_REGISTER_FILE, 1, 0);
   struct brw_reg payload_attr = get_element_ud(brw_vec1_reg(BRW_GENERAL_REGISTER_FILE, 1, 0), 0);
   struct brw_reg primmask;
   int jmp;
   struct brw_reg v1_null_ud = vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_UD));

   c->nr_verts = 3;
   alloc_regs(c);

   primmask = retype(get_element(c->tmp, 0), BRW_REGISTER_TYPE_UD);

   brw_MOV(p, primmask, brw_imm_ud(1));
   brw_SHL(p, primmask, primmask, payload_prim);

   brw_AND(p, v1_null_ud, primmask, brw_imm_ud((1<<_3DPRIM_TRILIST) |
					       (1<<_3DPRIM_TRISTRIP) |
					       (1<<_3DPRIM_TRIFAN) |
					       (1<<_3DPRIM_TRISTRIP_REVERSE) |
					       (1<<_3DPRIM_POLYGON) |
					       (1<<_3DPRIM_RECTLIST) |
					       (1<<_3DPRIM_TRIFAN_NOSTIPPLE)));
   brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_Z);
   jmp = brw_JMPI(p, brw_imm_d(0), BRW_PREDICATE_NORMAL) - p->store;
   brw_emit_tri_setup(c, false);
   brw_land_fwd_jump(p, jmp);

   brw_AND(p, v1_null_ud, primmask, brw_imm_ud((1<<_3DPRIM_LINELIST) |
					       (1<<_3DPRIM_LINESTRIP) |
					       (1<<_3DPRIM_LINELOOP) |
					       (1<<_3DPRIM_LINESTRIP_CONT) |
					       (1<<_3DPRIM_LINESTRIP_BF) |
					       (1<<_3DPRIM_LINESTRIP_CONT_BF)));
   brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_Z);
   jmp = brw_JMPI(p, brw_imm_d(0), BRW_PREDICATE_NORMAL) - p->store;
   brw_emit_line_setup(c, false);
   brw_land_fwd_jump(p, jmp);

   brw_AND(p, v1_null_ud, payload_attr, brw_imm_ud(1<<BRW_SPRITE_POINT_ENABLE));
   brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_Z);
   jmp = brw_JMPI(p, brw_imm_d(0), BRW_PREDICATE_NORMAL) - p->store;
   brw_emit_point_sprite_setup(c, false);
   brw_land_fwd_jump(p, jmp);

   brw_emit_point_setup( c, false );
}

const unsigned *
brw_compile_sf(const struct brw_compiler *compiler,
               void *mem_ctx,
               const struct brw_sf_prog_key *key,
               struct brw_sf_prog_data *prog_data,
               struct brw_vue_map *vue_map,
               unsigned *final_assembly_size)
{
   struct brw_sf_compile c;
   memset(&c, 0, sizeof(c));

   /* Begin the compilation:
    */
   brw_init_codegen(compiler->devinfo, &c.func, mem_ctx);

   c.key = *key;
   c.vue_map = *vue_map;
   if (c.key.do_point_coord) {
      /*
       * gl_PointCoord is a FS instead of VS builtin variable, thus it's
       * not included in c.vue_map generated in VS stage. Here we add
       * it manually to let SF shader generate the needed interpolation
       * coefficient for FS shader.
       */
      c.vue_map.varying_to_slot[BRW_VARYING_SLOT_PNTC] = c.vue_map.num_slots;
      c.vue_map.slot_to_varying[c.vue_map.num_slots++] = BRW_VARYING_SLOT_PNTC;
   }
   c.urb_entry_read_offset = BRW_SF_URB_ENTRY_READ_OFFSET;
   c.nr_attr_regs = (c.vue_map.num_slots + 1)/2 - c.urb_entry_read_offset;
   c.nr_setup_regs = c.nr_attr_regs;

   c.prog_data.urb_read_length = c.nr_attr_regs;
   c.prog_data.urb_entry_size = c.nr_setup_regs * 2;

   /* Which primitive?  Or all three?
    */
   switch (key->primitive) {
   case BRW_SF_PRIM_TRIANGLES:
      c.nr_verts = 3;
      brw_emit_tri_setup( &c, true );
      break;
   case BRW_SF_PRIM_LINES:
      c.nr_verts = 2;
      brw_emit_line_setup( &c, true );
      break;
   case BRW_SF_PRIM_POINTS:
      c.nr_verts = 1;
      if (key->do_point_sprite)
	  brw_emit_point_sprite_setup( &c, true );
      else
	  brw_emit_point_setup( &c, true );
      break;
   case BRW_SF_PRIM_UNFILLED_TRIS:
      c.nr_verts = 3;
      brw_emit_anyprim_setup( &c );
      break;
   default:
      unreachable("not reached");
   }

   /* FINISHME: SF programs use calculated jumps (i.e., JMPI with a register
    * source). Compacting would be difficult.
    */
   /* brw_compact_instructions(&c.func, 0, 0, NULL); */

   *prog_data = c.prog_data;

   const unsigned *program = brw_get_program(&c.func, final_assembly_size);

   if (unlikely(INTEL_DEBUG & DEBUG_SF)) {
      fprintf(stderr, "sf:\n");
      brw_disassemble(compiler->devinfo,
                      program, 0, *final_assembly_size, stderr);
      fprintf(stderr, "\n");
   }

   return program;
}
