/*
 * Copyright © 2019 Red Hat.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "lvp_private.h"
#include "vk_util.h"
#include "glsl_types.h"
#include "spirv/nir_spirv.h"
#include "nir/nir_builder.h"
#include "lvp_lower_vulkan_resource.h"
#include "pipe/p_state.h"
#include "pipe/p_context.h"
#include "nir/nir_xfb_info.h"

#define SPIR_V_MAGIC_NUMBER 0x07230203

/*
 * Duplicate `count` elements of `type` from `src` into a fresh array
 * allocated from the ralloc context named `mem_ctx` in the calling scope,
 * then store the copy in `dst`.
 *
 * On allocation failure this makes the *calling function* return
 * VK_ERROR_OUT_OF_HOST_MEMORY, so it may only be used inside functions that
 * return VkResult.
 *
 * `count` is evaluated exactly once, and the copy is skipped for empty
 * arrays so that a NULL `src` paired with a zero count never reaches
 * memcpy().
 */
#define LVP_PIPELINE_DUP(dst, src, type, count) do {             \
      const unsigned dup_count = (count);                        \
      type *temp = ralloc_array(mem_ctx, type, dup_count);       \
      if (!temp) return VK_ERROR_OUT_OF_HOST_MEMORY;             \
      if (dup_count)                                             \
         memcpy(temp, (src), sizeof(type) * dup_count);          \
      (dst) = temp;                                              \
   } while(0)

/*
 * Destroy a pipeline: delete every shader CSO that was created for it,
 * release the ralloc context holding the copied create info, and free the
 * pipeline object itself.  A null pipeline handle is ignored.
 */
VKAPI_ATTR void VKAPI_CALL lvp_DestroyPipeline(
   VkDevice                                    _device,
   VkPipeline                                  _pipeline,
   const VkAllocationCallbacks*                pAllocator)
{
   LVP_FROM_HANDLE(lvp_device, device, _device);
   LVP_FROM_HANDLE(lvp_pipeline, pipeline, _pipeline);

   /* Nothing to do for a null handle. */
   if (!_pipeline)
      return;

   struct pipe_context *ctx = device->queue.ctx;

   /* Each stage has its own delete hook; only stages that own a CSO are
    * torn down.
    */
   if (pipeline->shader_cso[PIPE_SHADER_VERTEX]) {
      ctx->delete_vs_state(ctx, pipeline->shader_cso[PIPE_SHADER_VERTEX]);
   }
   if (pipeline->shader_cso[PIPE_SHADER_FRAGMENT]) {
      ctx->delete_fs_state(ctx, pipeline->shader_cso[PIPE_SHADER_FRAGMENT]);
   }
   if (pipeline->shader_cso[PIPE_SHADER_GEOMETRY]) {
      ctx->delete_gs_state(ctx, pipeline->shader_cso[PIPE_SHADER_GEOMETRY]);
   }
   if (pipeline->shader_cso[PIPE_SHADER_TESS_CTRL]) {
      ctx->delete_tcs_state(ctx, pipeline->shader_cso[PIPE_SHADER_TESS_CTRL]);
   }
   if (pipeline->shader_cso[PIPE_SHADER_TESS_EVAL]) {
      ctx->delete_tes_state(ctx, pipeline->shader_cso[PIPE_SHADER_TESS_EVAL]);
   }
   if (pipeline->shader_cso[PIPE_SHADER_COMPUTE]) {
      ctx->delete_compute_state(ctx, pipeline->shader_cso[PIPE_SHADER_COMPUTE]);
   }

   /* Release the deep-copied create info, then the object itself. */
   ralloc_free(pipeline->mem_ctx);
   vk_object_base_finish(&pipeline->base);
   vk_free2(&device->vk.alloc, pAllocator, pipeline);
}

static VkResult
deep_copy_shader_stage(void *mem_ctx,
                       struct VkPipelineShaderStageCreateInfo *dst,
                       const struct VkPipelineShaderStageCreateInfo *src)
{
   dst->sType = src->sType;
   dst->pNext = NULL;
   dst->flags = src->flags;
   dst->stage = src->stage;
   dst->module = src->module;
   dst->pName = src->pName;
   dst->pSpecializationInfo = NULL;
   if (src->pSpecializationInfo) {
      const VkSpecializationInfo *src_spec = src->pSpecializationInfo;
      VkSpecializationInfo *dst_spec = ralloc_size(mem_ctx, sizeof(VkSpecializationInfo) +
                                                   src_spec->mapEntryCount * sizeof(VkSpecializationMapEntry) +
                                                   src_spec->dataSize);
      VkSpecializationMapEntry *maps = (VkSpecializationMapEntry *)(dst_spec + 1);
      dst_spec->pMapEntries = maps;
      void *pdata = (void *)(dst_spec->pMapEntries + src_spec->mapEntryCount);
      dst_spec->pData = pdata;


      dst_spec->mapEntryCount = src_spec->mapEntryCount;
      dst_spec->dataSize = src_spec->dataSize;
      memcpy(pdata, src_spec->pData, src->pSpecializationInfo->dataSize);
      memcpy(maps, src_spec->pMapEntries, src_spec->mapEntryCount * sizeof(VkSpecializationMapEntry));
      dst->pSpecializationInfo = dst_spec;
   }
   return VK_SUCCESS;
}

/*
 * Copy a vertex input state into `dst`, duplicating the binding and
 * attribute description arrays into `mem_ctx`.
 *
 * Of the source pNext chain only
 * VkPipelineVertexInputDivisorStateCreateInfoEXT is carried over (deep
 * copied, with a terminated pNext); every other extension struct is ignored.
 *
 * Returns VK_SUCCESS, or VK_ERROR_OUT_OF_HOST_MEMORY on allocation failure.
 */
static VkResult
deep_copy_vertex_input_state(void *mem_ctx,
                             struct VkPipelineVertexInputStateCreateInfo *dst,
                             const struct VkPipelineVertexInputStateCreateInfo *src)
{
   dst->sType = src->sType;
   dst->pNext = NULL;
   dst->flags = src->flags;
   dst->vertexBindingDescriptionCount = src->vertexBindingDescriptionCount;

   LVP_PIPELINE_DUP(dst->pVertexBindingDescriptions,
                    src->pVertexBindingDescriptions,
                    VkVertexInputBindingDescription,
                    src->vertexBindingDescriptionCount);

   dst->vertexAttributeDescriptionCount = src->vertexAttributeDescriptionCount;

   LVP_PIPELINE_DUP(dst->pVertexAttributeDescriptions,
                    src->pVertexAttributeDescriptions,
                    VkVertexInputAttributeDescription,
                    src->vertexAttributeDescriptionCount);

   if (src->pNext) {
      vk_foreach_struct(ext, src->pNext) {
         switch (ext->sType) {
         case VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_DIVISOR_STATE_CREATE_INFO_EXT: {
            VkPipelineVertexInputDivisorStateCreateInfoEXT *ext_src = (VkPipelineVertexInputDivisorStateCreateInfoEXT *)ext;
            VkPipelineVertexInputDivisorStateCreateInfoEXT *ext_dst = ralloc(mem_ctx, VkPipelineVertexInputDivisorStateCreateInfoEXT);
            if (!ext_dst)
               return VK_ERROR_OUT_OF_HOST_MEMORY;

            ext_dst->sType = ext_src->sType;
            /* ralloc() does not zero memory: terminate the copied chain
             * explicitly so later chain walks stop here.
             */
            ext_dst->pNext = NULL;
            ext_dst->vertexBindingDivisorCount = ext_src->vertexBindingDivisorCount;

            LVP_PIPELINE_DUP(ext_dst->pVertexBindingDivisors,
                             ext_src->pVertexBindingDivisors,
                             VkVertexInputBindingDivisorDescriptionEXT,
                             ext_src->vertexBindingDivisorCount);

            dst->pNext = ext_dst;
            break;
         }
         default:
            break;
         }
      }
   }
   return VK_SUCCESS;
}

/*
 * Report whether `state` is listed in the dynamic state create info.
 * A NULL `src` means no state is dynamic.
 */
static bool
dynamic_state_contains(const VkPipelineDynamicStateCreateInfo *src, VkDynamicState state)
{
   bool found = false;

   if (src) {
      /* Linear scan that stops at the first match. */
      for (unsigned idx = 0; !found && idx < src->dynamicStateCount; idx++)
         found = (src->pDynamicStates[idx] == state);
   }

   return found;
}

/*
 * Copy a viewport state into `dst`, honouring the pipeline's dynamic state.
 *
 * - The viewport array is duplicated only when neither VIEWPORT nor
 *   VIEWPORT_WITH_COUNT is dynamic; otherwise dst->pViewports stays NULL.
 * - The scissor array follows the same rule with SCISSOR /
 *   SCISSOR_WITH_COUNT.
 * - A count is stored as 0 when its *_WITH_COUNT state is dynamic.
 * - The pNext chain is dropped.
 *
 * Returns VK_SUCCESS, or VK_ERROR_OUT_OF_HOST_MEMORY on allocation failure.
 */
static VkResult
deep_copy_viewport_state(void *mem_ctx,
                         const VkPipelineDynamicStateCreateInfo *dyn_state,
                         VkPipelineViewportStateCreateInfo *dst,
                         const VkPipelineViewportStateCreateInfo *src)
{
   const bool dyn_viewport_count =
      dynamic_state_contains(dyn_state, VK_DYNAMIC_STATE_VIEWPORT_WITH_COUNT_EXT);
   const bool dyn_scissor_count =
      dynamic_state_contains(dyn_state, VK_DYNAMIC_STATE_SCISSOR_WITH_COUNT_EXT);

   dst->sType = VK_STRUCTURE_TYPE_PIPELINE_VIEWPORT_STATE_CREATE_INFO;
   dst->pNext = NULL;
   /* dst is not zero-initialized by the caller's allocator, so every field
    * has to be written here, including flags.
    */
   dst->flags = src->flags;
   dst->pViewports = NULL;
   dst->pScissors = NULL;

   if (!dynamic_state_contains(dyn_state, VK_DYNAMIC_STATE_VIEWPORT) &&
       !dyn_viewport_count) {
      /* Same NULL guard as for the scissors below. */
      if (src->pViewports) {
         LVP_PIPELINE_DUP(dst->pViewports,
                          src->pViewports,
                          VkViewport,
                          src->viewportCount);
      }
   }
   dst->viewportCount = dyn_viewport_count ? 0 : src->viewportCount;

   if (!dynamic_state_contains(dyn_state, VK_DYNAMIC_STATE_SCISSOR) &&
       !dyn_scissor_count) {
      if (src->pScissors) {
         LVP_PIPELINE_DUP(dst->pScissors,
                          src->pScissors,
                          VkRect2D,
                          src->scissorCount);
      }
   }
   dst->scissorCount = dyn_scissor_count ? 0 : src->scissorCount;

   return VK_SUCCESS;
}

/*
 * Copy a color blend state into `dst`.  The attachment array is duplicated
 * into `mem_ctx`; the pNext chain is dropped.
 *
 * Returns VK_SUCCESS, or VK_ERROR_OUT_OF_HOST_MEMORY if the attachment
 * array cannot be allocated.
 */
static VkResult
deep_copy_color_blend_state(void *mem_ctx,
                            VkPipelineColorBlendStateCreateInfo *dst,
                            const VkPipelineColorBlendStateCreateInfo *src)
{
   /* Scalar fields. */
   dst->sType = src->sType;
   dst->pNext = NULL;
   dst->flags = src->flags;
   dst->logicOp = src->logicOp;
   dst->logicOpEnable = src->logicOpEnable;

   /* Per-attachment blend settings. */
   LVP_PIPELINE_DUP(dst->pAttachments,
                    src->pAttachments,
                    VkPipelineColorBlendAttachmentState,
                    src->attachmentCount);
   dst->attachmentCount = src->attachmentCount;

   /* The four blend constants. */
   memcpy(&dst->blendConstants, &src->blendConstants, 4 * sizeof(float));

   return VK_SUCCESS;
}

/*
 * Copy a dynamic state create info into `dst`, duplicating the list of
 * dynamic states into `mem_ctx`.  The pNext chain is dropped.
 *
 * Returns VK_SUCCESS, or VK_ERROR_OUT_OF_HOST_MEMORY if the list cannot be
 * allocated.
 */
static VkResult
deep_copy_dynamic_state(void *mem_ctx,
                        VkPipelineDynamicStateCreateInfo *dst,
                        const VkPipelineDynamicStateCreateInfo *src)
{
   dst->pNext = NULL;
   dst->sType = src->sType;
   dst->flags = src->flags;

   /* The count is stored only once the array copy has succeeded. */
   LVP_PIPELINE_DUP(dst->pDynamicStates,
                    src->pDynamicStates,
                    VkDynamicState,
                    src->dynamicStateCount);
   dst->dynamicStateCount = src->dynamicStateCount;

   return VK_SUCCESS;
}


/*
 * Copy a rasterization state into `dst`.
 *
 * All plain fields are copied verbatim.  Of the source pNext chain only
 * VkPipelineRasterizationDepthClipStateCreateInfoEXT is carried over (deep
 * copied into `mem_ctx`, with a terminated pNext); every other extension
 * struct is ignored.
 *
 * Returns VK_SUCCESS, or VK_ERROR_OUT_OF_HOST_MEMORY on allocation failure.
 */
static VkResult
deep_copy_rasterization_state(void *mem_ctx,
                              VkPipelineRasterizationStateCreateInfo *dst,
                              const VkPipelineRasterizationStateCreateInfo *src)
{
   memcpy(dst, src, sizeof(VkPipelineRasterizationStateCreateInfo));
   /* Do not keep pointing into the caller's chain. */
   dst->pNext = NULL;

   if (src->pNext) {
      vk_foreach_struct(ext, src->pNext) {
         switch (ext->sType) {
         case VK_STRUCTURE_TYPE_PIPELINE_RASTERIZATION_DEPTH_CLIP_STATE_CREATE_INFO_EXT: {
            VkPipelineRasterizationDepthClipStateCreateInfoEXT *ext_src = (VkPipelineRasterizationDepthClipStateCreateInfoEXT *)ext;
            VkPipelineRasterizationDepthClipStateCreateInfoEXT *ext_dst = ralloc(mem_ctx, VkPipelineRasterizationDepthClipStateCreateInfoEXT);
            if (!ext_dst)
               return VK_ERROR_OUT_OF_HOST_MEMORY;

            ext_dst->sType = ext_src->sType;
            /* ralloc() does not zero memory: terminate the copied chain
             * explicitly so later chain walks stop here.
             */
            ext_dst->pNext = NULL;
            ext_dst->flags = ext_src->flags;
            ext_dst->depthClipEnable = ext_src->depthClipEnable;
            dst->pNext = ext_dst;
            break;
         }
         default:
            break;
         }
      }
   }
   return VK_SUCCESS;
}

/*
 * Build a copy of a graphics pipeline create info in `dst`, with all
 * copied sub-structures allocated from `mem_ctx`.
 *
 * State that is irrelevant for this pipeline is stored as NULL instead of
 * being copied:
 *  - vertex input state when VK_DYNAMIC_STATE_VERTEX_INPUT_EXT is dynamic;
 *  - tessellation state unless both tessellation stages are present;
 *  - viewport, multisample, depth/stencil and color blend state when
 *    rasterization is statically disabled;
 *  - depth/stencil state when the render pass has no depth/stencil
 *    attachment, and color blend state when it has no color attachment.
 *
 * The top-level pNext chain is dropped.  Only the first word of the sample
 * mask is copied.
 *
 * Returns VK_SUCCESS, or the first error (VK_ERROR_OUT_OF_HOST_MEMORY)
 * reported by an allocation or by one of the per-state copy helpers.
 */
static VkResult
deep_copy_graphics_create_info(void *mem_ctx,
                               VkGraphicsPipelineCreateInfo *dst,
                               const VkGraphicsPipelineCreateInfo *src)
{
   VkResult result;
   VkPipelineShaderStageCreateInfo *stages;
   VkPipelineVertexInputStateCreateInfo *vertex_input;
   VkPipelineRasterizationStateCreateInfo *rasterization_state;
   LVP_FROM_HANDLE(lvp_render_pass, pass, src->renderPass);

   dst->sType = src->sType;
   dst->pNext = NULL;
   dst->flags = src->flags;
   dst->layout = src->layout;
   dst->renderPass = src->renderPass;
   dst->subpass = src->subpass;
   dst->basePipelineHandle = src->basePipelineHandle;
   dst->basePipelineIndex = src->basePipelineIndex;

   /* pStages */
   VkShaderStageFlags stages_present = 0;
   dst->stageCount = src->stageCount;
   stages = ralloc_array(mem_ctx, VkPipelineShaderStageCreateInfo, dst->stageCount);
   if (!stages)
      return VK_ERROR_OUT_OF_HOST_MEMORY;
   for (uint32_t i = 0; i < dst->stageCount; i++) {
      result = deep_copy_shader_stage(mem_ctx, &stages[i], &src->pStages[i]);
      if (result != VK_SUCCESS)
         return result;
      /* Remember which stages exist; needed for the tessellation check. */
      stages_present |= src->pStages[i].stage;
   }
   dst->pStages = stages;

   /* pVertexInputState */
   if (!dynamic_state_contains(src->pDynamicState, VK_DYNAMIC_STATE_VERTEX_INPUT_EXT)) {
      vertex_input = ralloc(mem_ctx, VkPipelineVertexInputStateCreateInfo);
      if (!vertex_input)
         return VK_ERROR_OUT_OF_HOST_MEMORY;
      result = deep_copy_vertex_input_state(mem_ctx, vertex_input,
                                            src->pVertexInputState);
      if (result != VK_SUCCESS)
         return result;
      dst->pVertexInputState = vertex_input;
   } else
      dst->pVertexInputState = NULL;

   /* pInputAssemblyState */
   LVP_PIPELINE_DUP(dst->pInputAssemblyState,
                    src->pInputAssemblyState,
                    VkPipelineInputAssemblyStateCreateInfo,
                    1);

   /* pTessellationState: only meaningful when both tessellation stages
    * are part of the pipeline.
    */
   if (src->pTessellationState &&
      (stages_present & (VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT | VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT)) ==
                        (VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT | VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT)) {
      LVP_PIPELINE_DUP(dst->pTessellationState,
                       src->pTessellationState,
                       VkPipelineTessellationStateCreateInfo,
                       1);
   } else
      dst->pTessellationState = NULL;

   /* pViewportState */
   /* Rasterization only counts as disabled when the discard enable is
    * static and set; a dynamic discard enable keeps all state around.
    */
   bool rasterization_disabled = !dynamic_state_contains(src->pDynamicState, VK_DYNAMIC_STATE_RASTERIZER_DISCARD_ENABLE_EXT) &&
                                 src->pRasterizationState->rasterizerDiscardEnable;
   if (src->pViewportState && !rasterization_disabled) {
      VkPipelineViewportStateCreateInfo *viewport_state;
      viewport_state = ralloc(mem_ctx, VkPipelineViewportStateCreateInfo);
      if (!viewport_state)
         return VK_ERROR_OUT_OF_HOST_MEMORY;
      result = deep_copy_viewport_state(mem_ctx, src->pDynamicState,
                                        viewport_state, src->pViewportState);
      if (result != VK_SUCCESS)
         return result;
      dst->pViewportState = viewport_state;
   } else
      dst->pViewportState = NULL;

   /* pRasterizationState */
   rasterization_state = ralloc(mem_ctx, VkPipelineRasterizationStateCreateInfo);
   if (!rasterization_state)
      return VK_ERROR_OUT_OF_HOST_MEMORY;
   result = deep_copy_rasterization_state(mem_ctx, rasterization_state, src->pRasterizationState);
   if (result != VK_SUCCESS)
      return result;
   dst->pRasterizationState = rasterization_state;

   /* pMultisampleState */
   if (src->pMultisampleState && !rasterization_disabled) {
      VkPipelineMultisampleStateCreateInfo*   ms_state;
      /* One extra VkSampleMask word is allocated right behind the struct
       * to hold the copied sample mask.
       */
      ms_state = ralloc_size(mem_ctx, sizeof(VkPipelineMultisampleStateCreateInfo) + sizeof(VkSampleMask));
      if (!ms_state)
         return VK_ERROR_OUT_OF_HOST_MEMORY;
      /* does samplemask need deep copy? */
      memcpy(ms_state, src->pMultisampleState, sizeof(VkPipelineMultisampleStateCreateInfo));
      if (src->pMultisampleState->pSampleMask) {
         VkSampleMask *sample_mask = (VkSampleMask *)(ms_state + 1);
         sample_mask[0] = src->pMultisampleState->pSampleMask[0];
         ms_state->pSampleMask = sample_mask;
      }
      dst->pMultisampleState = ms_state;
   } else
      dst->pMultisampleState = NULL;

   /* pDepthStencilState */
   if (src->pDepthStencilState && !rasterization_disabled && pass->has_zs_attachment) {
      LVP_PIPELINE_DUP(dst->pDepthStencilState,
                       src->pDepthStencilState,
                       VkPipelineDepthStencilStateCreateInfo,
                       1);
   } else
      dst->pDepthStencilState = NULL;

   /* pColorBlendState */
   if (src->pColorBlendState && !rasterization_disabled && pass->has_color_attachment) {
      VkPipelineColorBlendStateCreateInfo*    cb_state;

      cb_state = ralloc(mem_ctx, VkPipelineColorBlendStateCreateInfo);
      if (!cb_state)
         return VK_ERROR_OUT_OF_HOST_MEMORY;
      result = deep_copy_color_blend_state(mem_ctx, cb_state, src->pColorBlendState);
      if (result != VK_SUCCESS)
         return result;
      dst->pColorBlendState = cb_state;
   } else
      dst->pColorBlendState = NULL;

   /* pDynamicState */
   if (src->pDynamicState) {
      VkPipelineDynamicStateCreateInfo*       dyn_state;

      dyn_state = ralloc(mem_ctx, VkPipelineDynamicStateCreateInfo);
      if (!dyn_state)
         return VK_ERROR_OUT_OF_HOST_MEMORY;
      result = deep_copy_dynamic_state(mem_ctx, dyn_state, src->pDynamicState);
      if (result != VK_SUCCESS)
         return result;
      dst->pDynamicState = dyn_state;
   } else
      dst->pDynamicState = NULL;

   return VK_SUCCESS;
}

/*
 * Copy a compute pipeline create info into `dst`.  The single shader stage
 * is copied through deep_copy_shader_stage(); the pNext chain is dropped.
 *
 * Returns whatever deep_copy_shader_stage() returns.
 */
static VkResult
deep_copy_compute_create_info(void *mem_ctx,
                              VkComputePipelineCreateInfo *dst,
                              const VkComputePipelineCreateInfo *src)
{
   dst->pNext = NULL;
   dst->sType = src->sType;
   dst->flags = src->flags;
   dst->layout = src->layout;
   dst->basePipelineIndex = src->basePipelineIndex;
   dst->basePipelineHandle = src->basePipelineHandle;

   /* The stage copy is the only step that can fail. */
   return deep_copy_shader_stage(mem_ctx, &dst->stage, &src->stage);
}

/*
 * Map a gl_shader_stage to the matching PIPE_SHADER_* value.
 * Any stage not listed here trips an assertion and falls back to
 * PIPE_SHADER_VERTEX.
 */
static inline unsigned
st_shader_stage_to_ptarget(gl_shader_stage stage)
{
   switch (stage) {
   case MESA_SHADER_VERTEX:    return PIPE_SHADER_VERTEX;
   case MESA_SHADER_FRAGMENT:  return PIPE_SHADER_FRAGMENT;
   case MESA_SHADER_GEOMETRY:  return PIPE_SHADER_GEOMETRY;
   case MESA_SHADER_TESS_CTRL: return PIPE_SHADER_TESS_CTRL;
   case MESA_SHADER_TESS_EVAL: return PIPE_SHADER_TESS_EVAL;
   case MESA_SHADER_COMPUTE:   return PIPE_SHADER_COMPUTE;
   default:
      assert(!"should not be reached");
      return PIPE_SHADER_VERTEX;
   }
}

/*
 * Size/alignment callback for vector or scalar shared variables.
 *
 * Booleans are counted as 4 bytes per component; every other type uses its
 * bit size divided by 8.  The size is the component size times the number
 * of vector elements, and the alignment is one component.
 */
static void
shared_var_info(const struct glsl_type *type, unsigned *size, unsigned *align)
{
   assert(glsl_type_is_vector_or_scalar(type));

   uint32_t bytes_per_comp;
   if (glsl_type_is_boolean(type))
      bytes_per_comp = 4;
   else
      bytes_per_comp = glsl_get_bit_size(type) / 8;

   const unsigned num_comps = glsl_get_vector_elements(type);

   *size = bytes_per_comp * num_comps;
   *align = bytes_per_comp;
}

/*
 * Translate one SPIR-V shader module into NIR and prepare it for the
 * gallium driver.
 *
 * The module is run through spirv_to_nir() with the given entrypoint and
 * specialization constants, followed by a fixed sequence of lowering
 * passes, an optimization loop that repeats until no pass reports progress,
 * and finally I/O location assignment.  The resulting shader is stored in
 * pipeline->pipeline_nir[stage].
 *
 * If spirv_to_nir() fails, the function returns without touching
 * pipeline->pipeline_nir[stage]; no error is reported to the caller.
 *
 * pipeline:        pipeline receiving the shader; its device and layout
 *                  are read.
 * module:          SPIR-V module; its size must be a multiple of 4 bytes.
 * entrypoint_name: name of the entry point to compile.
 * stage:           shader stage being compiled.
 * spec_info:       specialization constants, converted by
 *                  vk_spec_info_to_nir_spirv().
 */
static void
lvp_shader_compile_to_ir(struct lvp_pipeline *pipeline,
                         struct vk_shader_module *module,
                         const char *entrypoint_name,
                         gl_shader_stage stage,
                         const VkSpecializationInfo *spec_info)
{
   nir_shader *nir;
   const nir_shader_compiler_options *drv_options = pipeline->device->pscreen->get_compiler_options(pipeline->device->pscreen, PIPE_SHADER_IR_NIR, st_shader_stage_to_ptarget(stage));
   bool progress;
   uint32_t *spirv = (uint32_t *) module->data;
   /* Sanity checks on the SPIR-V blob (debug builds only). */
   assert(spirv[0] == SPIR_V_MAGIC_NUMBER);
   assert(module->size % 4 == 0);

   uint32_t num_spec_entries = 0;
   struct nir_spirv_specialization *spec_entries =
      vk_spec_info_to_nir_spirv(spec_info, &num_spec_entries);

   struct lvp_device *pdevice = pipeline->device;
   /* float64 and int64 are taken from the screen's caps; everything else
    * is enabled unconditionally.
    */
   const struct spirv_to_nir_options spirv_options = {
      .environment = NIR_SPIRV_VULKAN,
      .caps = {
         .float64 = (pdevice->pscreen->get_param(pdevice->pscreen, PIPE_CAP_DOUBLES) == 1),
         .int16 = true,
         .int64 = (pdevice->pscreen->get_param(pdevice->pscreen, PIPE_CAP_INT64) == 1),
         .tessellation = true,
         .float_controls = true,
         .image_ms_array = true,
         .image_read_without_format = true,
         .image_write_without_format = true,
         .storage_image_ms = true,
         .geometry_streams = true,
         .storage_8bit = true,
         .storage_16bit = true,
         .variable_pointers = true,
         .stencil_export = true,
         .post_depth_coverage = true,
         .transform_feedback = true,
         .device_group = true,
         .draw_parameters = true,
         .shader_viewport_index_layer = true,
         .multiview = true,
         .physical_storage_buffer_address = true,
         .int64_atomics = true,
         .subgroup_arithmetic = true,
         .subgroup_basic = true,
         .subgroup_ballot = true,
         .subgroup_quad = true,
         .subgroup_vote = true,
         .int8 = true,
         .float16 = true,
      },
      .ubo_addr_format = nir_address_format_32bit_index_offset,
      .ssbo_addr_format = nir_address_format_32bit_index_offset,
      .phys_ssbo_addr_format = nir_address_format_64bit_global,
      .push_const_addr_format = nir_address_format_logical,
      .shared_addr_format = nir_address_format_32bit_offset,
   };

   /* module->size is in bytes; spirv_to_nir() is given the word count. */
   nir = spirv_to_nir(spirv, module->size / 4,
                      spec_entries, num_spec_entries,
                      stage, entrypoint_name, &spirv_options, drv_options);

   /* Translation failed: release the spec entries and bail out, leaving
    * pipeline->pipeline_nir[stage] untouched.
    */
   if (!nir) {
      free(spec_entries);
      return;
   }
   nir_validate_shader(nir, NULL);

   free(spec_entries);

   const struct nir_lower_sysvals_to_varyings_options sysvals_to_varyings = {
      .frag_coord = true,
      .point_coord = true,
   };
   NIR_PASS_V(nir, nir_lower_sysvals_to_varyings, &sysvals_to_varyings);

   /* Inline all functions into the entrypoint. */
   NIR_PASS_V(nir, nir_lower_variable_initializers, nir_var_function_temp);
   NIR_PASS_V(nir, nir_lower_returns);
   NIR_PASS_V(nir, nir_inline_functions);
   NIR_PASS_V(nir, nir_copy_prop);
   NIR_PASS_V(nir, nir_opt_deref);

   /* Pick off the single entrypoint that we want */
   foreach_list_typed_safe(nir_function, func, node, &nir->functions) {
      if (!func->is_entrypoint)
         exec_node_remove(&func->node);
   }
   assert(exec_list_length(&nir->functions) == 1);

   NIR_PASS_V(nir, nir_lower_variable_initializers, ~0);
   NIR_PASS_V(nir, nir_split_var_copies);
   NIR_PASS_V(nir, nir_split_per_member_structs);

   NIR_PASS_V(nir, nir_remove_dead_variables,
              nir_var_shader_in | nir_var_shader_out | nir_var_system_value, NULL);

   /* Input attachments are lowered for fragment shaders only. */
   if (stage == MESA_SHADER_FRAGMENT)
      lvp_lower_input_attachments(nir, false);
   NIR_PASS_V(nir, nir_lower_system_values);
   NIR_PASS_V(nir, nir_lower_compute_system_values, NULL);

   NIR_PASS_V(nir, nir_lower_clip_cull_distance_arrays);
   NIR_PASS_V(nir, nir_remove_dead_variables, nir_var_uniform, NULL);

   lvp_lower_pipeline_layout(pipeline->device, pipeline->layout, nir);

   NIR_PASS_V(nir, nir_lower_io_to_temporaries, nir_shader_get_entrypoint(nir), true, true);
   NIR_PASS_V(nir, nir_split_var_copies);
   NIR_PASS_V(nir, nir_lower_global_vars_to_local);

   /* Explicit I/O lowering, one address format per memory class. */
   NIR_PASS_V(nir, nir_lower_explicit_io, nir_var_mem_push_const,
              nir_address_format_32bit_offset);

   NIR_PASS_V(nir, nir_lower_explicit_io,
              nir_var_mem_ubo | nir_var_mem_ssbo,
              nir_address_format_32bit_index_offset);

   NIR_PASS_V(nir, nir_lower_explicit_io,
              nir_var_mem_global,
              nir_address_format_64bit_global);

   /* Shared memory is laid out and lowered for compute shaders only. */
   if (nir->info.stage == MESA_SHADER_COMPUTE) {
      NIR_PASS_V(nir, nir_lower_vars_to_explicit_types, nir_var_mem_shared, shared_var_info);
      NIR_PASS_V(nir, nir_lower_explicit_io, nir_var_mem_shared, nir_address_format_32bit_offset);
   }

   NIR_PASS_V(nir, nir_remove_dead_variables, nir_var_shader_temp, NULL);

   if (nir->info.stage == MESA_SHADER_VERTEX ||
       nir->info.stage == MESA_SHADER_GEOMETRY) {
      NIR_PASS_V(nir, nir_lower_io_arrays_to_elements_no_indirects, false);
   } else if (nir->info.stage == MESA_SHADER_FRAGMENT) {
      NIR_PASS_V(nir, nir_lower_io_arrays_to_elements_no_indirects, true);
   }

   /* Optimization loop: rerun the whole pass list until an iteration
    * reports no progress.
    */
   do {
      progress = false;

      NIR_PASS(progress, nir, nir_lower_flrp, 32|64, true);
      NIR_PASS(progress, nir, nir_split_array_vars, nir_var_function_temp);
      NIR_PASS(progress, nir, nir_shrink_vec_array_vars, nir_var_function_temp);
      NIR_PASS(progress, nir, nir_opt_deref);
      NIR_PASS(progress, nir, nir_lower_vars_to_ssa);

      NIR_PASS(progress, nir, nir_copy_prop);
      NIR_PASS(progress, nir, nir_opt_dce);
      NIR_PASS(progress, nir, nir_opt_peephole_select, 8, true, true);

      NIR_PASS(progress, nir, nir_opt_algebraic);
      NIR_PASS(progress, nir, nir_opt_constant_folding);

      NIR_PASS(progress, nir, nir_opt_remove_phis);
      bool trivial_continues = false;
      NIR_PASS(trivial_continues, nir, nir_opt_trivial_continues);
      progress |= trivial_continues;
      if (trivial_continues) {
         /* If nir_opt_trivial_continues makes progress, then we need to clean
          * things up if we want any hope of nir_opt_if or nir_opt_loop_unroll
          * to make progress.
          */
         NIR_PASS(progress, nir, nir_copy_prop);
         NIR_PASS(progress, nir, nir_opt_dce);
         NIR_PASS(progress, nir, nir_opt_remove_phis);
      }
      NIR_PASS(progress, nir, nir_opt_if, true);
      NIR_PASS(progress, nir, nir_opt_dead_cf);
      NIR_PASS(progress, nir, nir_opt_conditional_discard);
      NIR_PASS(progress, nir, nir_opt_remove_phis);
      NIR_PASS(progress, nir, nir_opt_cse);
      NIR_PASS(progress, nir, nir_opt_undef);

      NIR_PASS(progress, nir, nir_opt_deref);
      NIR_PASS(progress, nir, nir_lower_alu_to_scalar, NULL, NULL);
   } while (progress);

   NIR_PASS_V(nir, nir_lower_var_copies);
   NIR_PASS_V(nir, nir_remove_dead_variables, nir_var_function_temp, NULL);
   NIR_PASS_V(nir, nir_opt_dce);
   nir_sweep(nir);

   nir_shader_gather_info(nir, nir_shader_get_entrypoint(nir));

   /* Vertex shader inputs get their driver location directly from the
    * attribute location, relative to VERT_ATTRIB_GENERIC0; all other stages
    * use nir_assign_io_var_locations().
    */
   if (nir->info.stage != MESA_SHADER_VERTEX)
      nir_assign_io_var_locations(nir, nir_var_shader_in, &nir->num_inputs, nir->info.stage);
   else {
      nir->num_inputs = util_last_bit64(nir->info.inputs_read);
      nir_foreach_shader_in_variable(var, nir) {
         var->data.driver_location = var->data.location - VERT_ATTRIB_GENERIC0;
      }
   }
   nir_assign_io_var_locations(nir, nir_var_shader_out, &nir->num_outputs,
                               nir->info.stage);
   /* Hand the finished shader to the pipeline. */
   pipeline->pipeline_nir[stage] = nir;
}

/*
 * Point a pipe_shader_state at the pipeline's NIR shader for `stage` and
 * mark its IR type as NIR.
 */
static void
fill_shader_prog(struct pipe_shader_state *state,
                 gl_shader_stage stage,
                 struct lvp_pipeline *pipeline)
{
   state->ir.nir = pipeline->pipeline_nir[stage];
   state->type = PIPE_SHADER_IR_NIR;
}

/*
 * Fold the tessellation execution modes declared in the TCS into the TES
 * shader info.
 */
static void
merge_tess_info(struct shader_info *tes_info,
                const struct shader_info *tcs_info)
{
   /* The Vulkan 1.0.38 spec, section 21.1 Tessellator says:
    *
    *    "PointMode. Controls generation of points rather than triangles
    *     or lines. This functionality defaults to disabled, and is
    *     enabled if either shader stage includes the execution mode."
    *
    * and about Triangles, Quads, IsoLines, VertexOrderCw, VertexOrderCcw,
    * PointMode, SpacingEqual, SpacingFractionalEven, SpacingFractionalOdd,
    * and OutputVertices, it says:
    *
    *    "One mode must be set in at least one of the tessellation
    *     shader stages."
    *
    * So, the fields can be set in either the TCS or TES, but they must
    * agree if set in both.  Our backend looks at TES, so bitwise-or in
    * the values from the TCS.
    */

   /* First check that both stages agree wherever both set a value... */
   assert(tcs_info->tess.tcs_vertices_out == 0 ||
          tes_info->tess.tcs_vertices_out == 0 ||
          tcs_info->tess.tcs_vertices_out == tes_info->tess.tcs_vertices_out);
   assert(tcs_info->tess.spacing == TESS_SPACING_UNSPECIFIED ||
          tes_info->tess.spacing == TESS_SPACING_UNSPECIFIED ||
          tcs_info->tess.spacing == tes_info->tess.spacing);
   assert(tcs_info->tess.primitive_mode == 0 ||
          tes_info->tess.primitive_mode == 0 ||
          tcs_info->tess.primitive_mode == tes_info->tess.primitive_mode);

   /* ...then merge the TCS values into the TES info. */
   tes_info->tess.tcs_vertices_out |= tcs_info->tess.tcs_vertices_out;
   tes_info->tess.spacing |= tcs_info->tess.spacing;
   tes_info->tess.primitive_mode |= tcs_info->tess.primitive_mode;
   tes_info->tess.ccw |= tcs_info->tess.ccw;
   tes_info->tess.point_mode |= tcs_info->tess.point_mode;
}

/*
 * Map a single VkShaderStageFlagBits value to the matching gl_shader_stage.
 * Any value not listed here hits unreachable().
 */
static gl_shader_stage
lvp_shader_stage(VkShaderStageFlagBits stage)
{
   gl_shader_stage mesa_stage = MESA_SHADER_NONE;

   switch (stage) {
   case VK_SHADER_STAGE_VERTEX_BIT:
      mesa_stage = MESA_SHADER_VERTEX;
      break;
   case VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT:
      mesa_stage = MESA_SHADER_TESS_CTRL;
      break;
   case VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT:
      mesa_stage = MESA_SHADER_TESS_EVAL;
      break;
   case VK_SHADER_STAGE_GEOMETRY_BIT:
      mesa_stage = MESA_SHADER_GEOMETRY;
      break;
   case VK_SHADER_STAGE_FRAGMENT_BIT:
      mesa_stage = MESA_SHADER_FRAGMENT;
      break;
   case VK_SHADER_STAGE_COMPUTE_BIT:
      mesa_stage = MESA_SHADER_COMPUTE;
      break;
   default:
      unreachable("invalid VkShaderStageFlagBits");
      break;
   }

   return mesa_stage;
}

/*
 * Create the gallium shader CSO for one stage of the pipeline from the NIR
 * stored in pipeline->pipeline_nir[stage], and store it in
 * pipeline->shader_cso[].
 *
 * The NIR is first passed to the screen's finalize_nir hook.  Compute
 * shaders become a pipe_compute_state.  Every other stage becomes a
 * pipe_shader_state; for vertex, geometry and tess-eval shaders the
 * stream-output description is filled in from the shader's transform
 * feedback info when there is any.
 *
 * Always returns VK_SUCCESS.
 */
static VkResult
lvp_pipeline_compile(struct lvp_pipeline *pipeline,
                     gl_shader_stage stage)
{
   struct lvp_device *device = pipeline->device;
   struct pipe_context *ctx = device->queue.ctx;

   device->physical_device->pscreen->finalize_nir(device->physical_device->pscreen, pipeline->pipeline_nir[stage]);

   nir_shader *nir = pipeline->pipeline_nir[stage];

   if (stage == MESA_SHADER_COMPUTE) {
      struct pipe_compute_state shstate = {0};
      shstate.prog = (void *)nir;
      shstate.ir_type = PIPE_SHADER_IR_NIR;
      shstate.req_local_mem = nir->info.shared_size;
      pipeline->shader_cso[PIPE_SHADER_COMPUTE] = ctx->create_compute_state(ctx, &shstate);
      return VK_SUCCESS;
   }

   struct pipe_shader_state shstate = {0};
   fill_shader_prog(&shstate, stage, pipeline);

   /* Only these stages are queried for transform feedback output. */
   const bool may_stream_out = stage == MESA_SHADER_VERTEX ||
                               stage == MESA_SHADER_GEOMETRY ||
                               stage == MESA_SHADER_TESS_EVAL;
   nir_xfb_info *xfb_info = may_stream_out ? nir_gather_xfb_info(nir, NULL) : NULL;

   if (xfb_info) {
      /* Varying slot -> driver location, needed for register_index. */
      uint8_t output_mapping[VARYING_SLOT_TESS_MAX];
      memset(output_mapping, 0, sizeof(output_mapping));

      nir_foreach_shader_out_variable(var, nir) {
         unsigned slots;
         if (var->data.compact)
            slots = DIV_ROUND_UP(glsl_get_length(var->type), 4);
         else
            slots = glsl_count_attribute_slots(var->type, false);

         for (unsigned s = 0; s < slots; s++)
            output_mapping[var->data.location + s] = var->data.driver_location + s;
      }

      shstate.stream_output.num_outputs = xfb_info->output_count;

      /* xfb strides and offsets are divided by 4 for stream_output. */
      for (unsigned buf = 0; buf < PIPE_MAX_SO_BUFFERS; buf++) {
         if (xfb_info->buffers_written & (1 << buf))
            shstate.stream_output.stride[buf] = xfb_info->buffers[buf].stride / 4;
      }

      for (unsigned out = 0; out < xfb_info->output_count; out++) {
         shstate.stream_output.output[out].output_buffer = xfb_info->outputs[out].buffer;
         shstate.stream_output.output[out].dst_offset = xfb_info->outputs[out].offset / 4;
         shstate.stream_output.output[out].register_index = output_mapping[xfb_info->outputs[out].location];
         shstate.stream_output.output[out].num_components = util_bitcount(xfb_info->outputs[out].component_mask);
         shstate.stream_output.output[out].start_component = ffs(xfb_info->outputs[out].component_mask) - 1;
         shstate.stream_output.output[out].stream = xfb_info->buffer_to_stream[xfb_info->outputs[out].buffer];
      }

      ralloc_free(xfb_info);
   }

   /* Each graphics stage has its own CSO creation hook. */
   switch (stage) {
   case MESA_SHADER_FRAGMENT:
      pipeline->shader_cso[PIPE_SHADER_FRAGMENT] = ctx->create_fs_state(ctx, &shstate);
      break;
   case MESA_SHADER_VERTEX:
      pipeline->shader_cso[PIPE_SHADER_VERTEX] = ctx->create_vs_state(ctx, &shstate);
      break;
   case MESA_SHADER_GEOMETRY:
      pipeline->shader_cso[PIPE_SHADER_GEOMETRY] = ctx->create_gs_state(ctx, &shstate);
      break;
   case MESA_SHADER_TESS_CTRL:
      pipeline->shader_cso[PIPE_SHADER_TESS_CTRL] = ctx->create_tcs_state(ctx, &shstate);
      break;
   case MESA_SHADER_TESS_EVAL:
      pipeline->shader_cso[PIPE_SHADER_TESS_EVAL] = ctx->create_tes_state(ctx, &shstate);
      break;
   default:
      unreachable("illegal shader");
      break;
   }

   return VK_SUCCESS;
}

static VkResult
lvp_graphics_pipeline_init(struct lvp_pipeline *pipeline,
                           struct lvp_device *device,
                           struct lvp_pipeline_cache *cache,
                           const VkGraphicsPipelineCreateInfo *pCreateInfo,
                           const VkAllocationCallbacks *alloc)
{
   if (alloc == NULL)
      alloc = &device->vk.alloc;
   pipeline->device = device;
   pipeline->layout = lvp_pipeline_layout_from_handle(pCreateInfo->layout);
   pipeline->force_min_sample = false;

   pipeline->mem_ctx = ralloc_context(NULL);
   /* recreate createinfo */
   deep_copy_graphics_create_info(pipeline->mem_ctx, &pipeline->graphics_create_info, pCreateInfo);
   pipeline->is_compute_pipeline = false;

   const VkPipelineRasterizationProvokingVertexStateCreateInfoEXT *pv_state =
      vk_find_struct_const(pCreateInfo->pRasterizationState,
                           PIPELINE_RASTERIZATION_PROVOKING_VERTEX_STATE_CREATE_INFO_EXT);
   pipeline->provoking_vertex_last = pv_state && pv_state->provokingVertexMode == VK_PROVOKING_VERTEX_MODE_LAST_VERTEX_EXT;

   const VkPipelineRasterizationLineStateCreateInfoEXT *line_state =
      vk_find_struct_const(pCreateInfo->pRasterizationState,
                           PIPELINE_RASTERIZATION_LINE_STATE_CREATE_INFO_EXT);
   if (line_state) {
      /* always draw bresenham if !smooth */
      pipeline->line_stipple_enable = line_state->stippledLineEnable;
      pipeline->line_smooth = line_state->lineRasterizationMode == VK_LINE_RASTERIZATION_MODE_RECTANGULAR_SMOOTH_EXT;
      pipeline->disable_multisample = line_state->lineRasterizationMode == VK_LINE_RASTERIZATION_MODE_BRESENHAM_EXT ||
                                      line_state->lineRasterizationMode == VK_LINE_RASTERIZATION_MODE_RECTANGULAR_SMOOTH_EXT;
      pipeline->line_rectangular = line_state->lineRasterizationMode != VK_LINE_RASTERIZATION_MODE_BRESENHAM_EXT;
      if (pipeline->line_stipple_enable) {
         if (!dynamic_state_contains(pipeline->graphics_create_info.pDynamicState, VK_DYNAMIC_STATE_LINE_STIPPLE_EXT)) {
            pipeline->line_stipple_factor = line_state->lineStippleFactor - 1;
            pipeline->line_stipple_pattern = line_state->lineStipplePattern;
         } else {
            pipeline->line_stipple_factor = 0;
            pipeline->line_stipple_pattern = UINT16_MAX;
         }
      }
   } else
      pipeline->line_rectangular = true;

   bool rasterization_disabled = !dynamic_state_contains(pipeline->graphics_create_info.pDynamicState, VK_DYNAMIC_STATE_RASTERIZER_DISCARD_ENABLE_EXT) &&
      pipeline->graphics_create_info.pRasterizationState->rasterizerDiscardEnable;
   LVP_FROM_HANDLE(lvp_render_pass, pass, pipeline->graphics_create_info.renderPass);
   if (!dynamic_state_contains(pipeline->graphics_create_info.pDynamicState, VK_DYNAMIC_STATE_COLOR_WRITE_ENABLE_EXT) &&
       !rasterization_disabled && pass->has_color_attachment) {
      const VkPipelineColorWriteCreateInfoEXT *cw_state =
         vk_find_struct_const(pCreateInfo->pColorBlendState, PIPELINE_COLOR_WRITE_CREATE_INFO_EXT);
      if (cw_state) {
         for (unsigned i = 0; i < cw_state->attachmentCount; i++)
            if (!cw_state->pColorWriteEnables[i]) {
               VkPipelineColorBlendAttachmentState *att = (void*)&pipeline->graphics_create_info.pColorBlendState->pAttachments[i];
               att->colorWriteMask = 0;
            }
      }
   }


   for (uint32_t i = 0; i < pCreateInfo->stageCount; i++) {
      VK_FROM_HANDLE(vk_shader_module, module,
                      pCreateInfo->pStages[i].module);
      gl_shader_stage stage = lvp_shader_stage(pCreateInfo->pStages[i].stage);
      lvp_shader_compile_to_ir(pipeline, module,
                               pCreateInfo->pStages[i].pName,
                               stage,
                               pCreateInfo->pStages[i].pSpecializationInfo);
      if (!pipeline->pipeline_nir[stage])
         return VK_ERROR_FEATURE_NOT_PRESENT;
   }

   if (pipeline->pipeline_nir[MESA_SHADER_FRAGMENT]) {
      if (pipeline->pipeline_nir[MESA_SHADER_FRAGMENT]->info.fs.uses_sample_qualifier ||
          BITSET_TEST(pipeline->pipeline_nir[MESA_SHADER_FRAGMENT]->info.system_values_read, SYSTEM_VALUE_SAMPLE_ID) ||
          BITSET_TEST(pipeline->pipeline_nir[MESA_SHADER_FRAGMENT]->info.system_values_read, SYSTEM_VALUE_SAMPLE_POS))
         pipeline->force_min_sample = true;
   }
   if (pipeline->pipeline_nir[MESA_SHADER_TESS_CTRL]) {
      nir_lower_patch_vertices(pipeline->pipeline_nir[MESA_SHADER_TESS_EVAL], pipeline->pipeline_nir[MESA_SHADER_TESS_CTRL]->info.tess.tcs_vertices_out, NULL);
      merge_tess_info(&pipeline->pipeline_nir[MESA_SHADER_TESS_EVAL]->info, &pipeline->pipeline_nir[MESA_SHADER_TESS_CTRL]->info);
      const VkPipelineTessellationDomainOriginStateCreateInfo *domain_origin_state =
         vk_find_struct_const(pCreateInfo->pTessellationState,
                              PIPELINE_TESSELLATION_DOMAIN_ORIGIN_STATE_CREATE_INFO);
      if (!domain_origin_state || domain_origin_state->domainOrigin == VK_TESSELLATION_DOMAIN_ORIGIN_UPPER_LEFT)
         pipeline->pipeline_nir[MESA_SHADER_TESS_EVAL]->info.tess.ccw = !pipeline->pipeline_nir[MESA_SHADER_TESS_EVAL]->info.tess.ccw;
   }

   pipeline->gs_output_lines = pipeline->pipeline_nir[MESA_SHADER_GEOMETRY] &&
                               pipeline->pipeline_nir[MESA_SHADER_GEOMETRY]->info.gs.output_primitive == GL_LINES;


   bool has_fragment_shader = false;
   for (uint32_t i = 0; i < pCreateInfo->stageCount; i++) {
      gl_shader_stage stage = lvp_shader_stage(pCreateInfo->pStages[i].stage);
      lvp_pipeline_compile(pipeline, stage);
      if (stage == MESA_SHADER_FRAGMENT)
         has_fragment_shader = true;
   }

   if (has_fragment_shader == false) {
      /* create a dummy fragment shader for this pipeline. */
      nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_FRAGMENT, NULL,
                                                     "dummy_frag");

      pipeline->pipeline_nir[MESA_SHADER_FRAGMENT] = b.shader;
      struct pipe_shader_state shstate = {0};
      shstate.type = PIPE_SHADER_IR_NIR;
      shstate.ir.nir = pipeline->pipeline_nir[MESA_SHADER_FRAGMENT];
      pipeline->shader_cso[PIPE_SHADER_FRAGMENT] = device->queue.ctx->create_fs_state(device->queue.ctx, &shstate);
   }
   return VK_SUCCESS;
}

/*
 * Allocate and initialise a single graphics pipeline.
 *
 * The object is zero-allocated with the device/pAllocator callbacks,
 * registered as a VK_OBJECT_TYPE_PIPELINE base object and then populated by
 * lvp_graphics_pipeline_init().  *pPipeline is written only on success.
 *
 * Returns VK_SUCCESS, VK_ERROR_OUT_OF_HOST_MEMORY if the object could not be
 * allocated, or whatever lvp_graphics_pipeline_init() reported.
 */
static VkResult
lvp_graphics_pipeline_create(
   VkDevice _device,
   VkPipelineCache _cache,
   const VkGraphicsPipelineCreateInfo *pCreateInfo,
   const VkAllocationCallbacks *pAllocator,
   VkPipeline *pPipeline)
{
   LVP_FROM_HANDLE(lvp_device, device, _device);
   LVP_FROM_HANDLE(lvp_pipeline_cache, cache, _cache);
   struct lvp_pipeline *pipeline;
   VkResult result;

   assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO);

   pipeline = vk_zalloc2(&device->vk.alloc, pAllocator, sizeof(*pipeline), 8,
                         VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
   if (pipeline == NULL)
      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);

   vk_object_base_init(&device->vk, &pipeline->base,
                       VK_OBJECT_TYPE_PIPELINE);
   result = lvp_graphics_pipeline_init(pipeline, device, cache, pCreateInfo,
                                       pAllocator);
   if (result != VK_SUCCESS) {
      /* undo vk_object_base_init() before giving the storage back */
      vk_object_base_finish(&pipeline->base);
      vk_free2(&device->vk.alloc, pAllocator, pipeline);
      return result;
   }

   *pPipeline = lvp_pipeline_to_handle(pipeline);

   return VK_SUCCESS;
}

/*
 * vkCreateGraphicsPipelines entry point.
 *
 * Every create info is attempted, even after an earlier one has failed.  The
 * output slot of a failed entry is set to VK_NULL_HANDLE, and the value
 * returned is the error of the last entry that failed (VK_SUCCESS if none
 * did).
 */
VKAPI_ATTR VkResult VKAPI_CALL lvp_CreateGraphicsPipelines(
   VkDevice                                    _device,
   VkPipelineCache                             pipelineCache,
   uint32_t                                    count,
   const VkGraphicsPipelineCreateInfo*         pCreateInfos,
   const VkAllocationCallbacks*                pAllocator,
   VkPipeline*                                 pPipelines)
{
   VkResult overall = VK_SUCCESS;

   for (uint32_t idx = 0; idx < count; idx++) {
      const VkGraphicsPipelineCreateInfo *info = pCreateInfos + idx;
      VkPipeline *slot = pPipelines + idx;
      const VkResult status =
         lvp_graphics_pipeline_create(_device, pipelineCache, info,
                                      pAllocator, slot);

      if (status == VK_SUCCESS)
         continue;

      /* this entry failed: blank its handle and remember the error */
      *slot = VK_NULL_HANDLE;
      overall = status;
   }

   return overall;
}

static VkResult
lvp_compute_pipeline_init(struct lvp_pipeline *pipeline,
                          struct lvp_device *device,
                          struct lvp_pipeline_cache *cache,
                          const VkComputePipelineCreateInfo *pCreateInfo,
                          const VkAllocationCallbacks *alloc)
{
   VK_FROM_HANDLE(vk_shader_module, module,
                   pCreateInfo->stage.module);
   if (alloc == NULL)
      alloc = &device->vk.alloc;
   pipeline->device = device;
   pipeline->layout = lvp_pipeline_layout_from_handle(pCreateInfo->layout);
   pipeline->force_min_sample = false;

   pipeline->mem_ctx = ralloc_context(NULL);
   deep_copy_compute_create_info(pipeline->mem_ctx,
                                 &pipeline->compute_create_info, pCreateInfo);
   pipeline->is_compute_pipeline = true;

   lvp_shader_compile_to_ir(pipeline, module,
                            pCreateInfo->stage.pName,
                            MESA_SHADER_COMPUTE,
                            pCreateInfo->stage.pSpecializationInfo);
   if (!pipeline->pipeline_nir[MESA_SHADER_COMPUTE])
      return VK_ERROR_FEATURE_NOT_PRESENT;
   lvp_pipeline_compile(pipeline, MESA_SHADER_COMPUTE);
   return VK_SUCCESS;
}

/*
 * Allocate and initialise a single compute pipeline.
 *
 * The object is zero-allocated with the device/pAllocator callbacks,
 * registered as a VK_OBJECT_TYPE_PIPELINE base object and then populated by
 * lvp_compute_pipeline_init().  *pPipeline is written only on success.
 *
 * Returns VK_SUCCESS, VK_ERROR_OUT_OF_HOST_MEMORY if the object could not be
 * allocated, or whatever lvp_compute_pipeline_init() reported.
 */
static VkResult
lvp_compute_pipeline_create(
   VkDevice _device,
   VkPipelineCache _cache,
   const VkComputePipelineCreateInfo *pCreateInfo,
   const VkAllocationCallbacks *pAllocator,
   VkPipeline *pPipeline)
{
   LVP_FROM_HANDLE(lvp_device, device, _device);
   LVP_FROM_HANDLE(lvp_pipeline_cache, cache, _cache);
   struct lvp_pipeline *pipeline;
   VkResult result;

   assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO);

   pipeline = vk_zalloc2(&device->vk.alloc, pAllocator, sizeof(*pipeline), 8,
                         VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
   if (pipeline == NULL)
      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);

   vk_object_base_init(&device->vk, &pipeline->base,
                       VK_OBJECT_TYPE_PIPELINE);
   result = lvp_compute_pipeline_init(pipeline, device, cache, pCreateInfo,
                                      pAllocator);
   if (result != VK_SUCCESS) {
      /* undo vk_object_base_init() before giving the storage back */
      vk_object_base_finish(&pipeline->base);
      vk_free2(&device->vk.alloc, pAllocator, pipeline);
      return result;
   }

   *pPipeline = lvp_pipeline_to_handle(pipeline);

   return VK_SUCCESS;
}

/*
 * vkCreateComputePipelines entry point.
 *
 * Every create info is attempted, even after an earlier one has failed.  The
 * output slot of a failed entry is set to VK_NULL_HANDLE, and the value
 * returned is the error of the last entry that failed (VK_SUCCESS if none
 * did).
 */
VKAPI_ATTR VkResult VKAPI_CALL lvp_CreateComputePipelines(
   VkDevice                                    _device,
   VkPipelineCache                             pipelineCache,
   uint32_t                                    count,
   const VkComputePipelineCreateInfo*          pCreateInfos,
   const VkAllocationCallbacks*                pAllocator,
   VkPipeline*                                 pPipelines)
{
   VkResult overall = VK_SUCCESS;

   for (uint32_t idx = 0; idx < count; idx++) {
      const VkComputePipelineCreateInfo *info = pCreateInfos + idx;
      VkPipeline *slot = pPipelines + idx;
      const VkResult status =
         lvp_compute_pipeline_create(_device, pipelineCache, info,
                                     pAllocator, slot);

      if (status == VK_SUCCESS)
         continue;

      /* this entry failed: blank its handle and remember the error */
      *slot = VK_NULL_HANDLE;
      overall = status;
   }

   return overall;
}