0512-drm-vc4-Fix-races-when-the-CS-reads-from-render-targ.patch

  1. From 057da8ee92db7c8caece571aa20f478f5cae1318 Mon Sep 17 00:00:00 2001
  2. From: Eric Anholt <eric@anholt.net>
  3. Date: Tue, 27 Sep 2016 09:03:13 -0700
  4. Subject: [PATCH] drm/vc4: Fix races when the CS reads from render targets.
  5. With the introduction of bin/render pipelining, the previous job may
  6. not be completed when we start binning the next one. If the previous
  7. job wrote our VBO, IB, or CS textures, then the binning stage might
  8. get stale or uninitialized results.
  9. Fixes the major rendering failure in glmark2 -b terrain.
  10. Signed-off-by: Eric Anholt <eric@anholt.net>
  11. Fixes: ca26d28bbaa3 ("drm/vc4: improve throughput by pipelining binning and rendering jobs")
  12. Cc: stable@vger.kernel.org
  13. ---
  14. drivers/gpu/drm/vc4/vc4_drv.h | 19 ++++++++++++++++++-
  15. drivers/gpu/drm/vc4/vc4_gem.c | 13 +++++++++++++
  16. drivers/gpu/drm/vc4/vc4_render_cl.c | 21 +++++++++++++++++----
  17. drivers/gpu/drm/vc4/vc4_validate.c | 17 ++++++++++++++---
  18. 4 files changed, 62 insertions(+), 8 deletions(-)
  19. --- a/drivers/gpu/drm/vc4/vc4_drv.h
  20. +++ b/drivers/gpu/drm/vc4/vc4_drv.h
  21. @@ -129,9 +129,16 @@ to_vc4_dev(struct drm_device *dev)
  22. struct vc4_bo {
  23. struct drm_gem_cma_object base;
  24. - /* seqno of the last job to render to this BO. */
  25. + /* seqno of the last job to render using this BO. */
  26. uint64_t seqno;
  27. + /* seqno of the last job to use the RCL to write to this BO.
  28. + *
  29. + * Note that this doesn't include binner overflow memory
  30. + * writes.
  31. + */
  32. + uint64_t write_seqno;
  33. +
  34. /* List entry for the BO's position in either
  35. * vc4_exec_info->unref_list or vc4_dev->bo_cache.time_list
  36. */
  37. @@ -227,6 +234,9 @@ struct vc4_exec_info {
  38. /* Sequence number for this bin/render job. */
  39. uint64_t seqno;
  40. + /* Latest write_seqno of any BO that binning depends on. */
  41. + uint64_t bin_dep_seqno;
  42. +
  43. /* Last current addresses the hardware was processing when the
  44. * hangcheck timer checked on us.
  45. */
  46. @@ -241,6 +251,13 @@ struct vc4_exec_info {
  47. struct drm_gem_cma_object **bo;
  48. uint32_t bo_count;
  49. + /* List of BOs that are being written by the RCL. Other than
  50. + * the binner temporary storage, this is all the BOs written
  51. + * by the job.
  52. + */
  53. + struct drm_gem_cma_object *rcl_write_bo[4];
  54. + uint32_t rcl_write_bo_count;
  55. +
  56. /* Pointers for our position in vc4->job_list */
  57. struct list_head head;
  58. --- a/drivers/gpu/drm/vc4/vc4_gem.c
  59. +++ b/drivers/gpu/drm/vc4/vc4_gem.c
  60. @@ -483,6 +483,11 @@ vc4_update_bo_seqnos(struct vc4_exec_inf
  61. list_for_each_entry(bo, &exec->unref_list, unref_head) {
  62. bo->seqno = seqno;
  63. }
  64. +
  65. + for (i = 0; i < exec->rcl_write_bo_count; i++) {
  66. + bo = to_vc4_bo(&exec->rcl_write_bo[i]->base);
  67. + bo->write_seqno = seqno;
  68. + }
  69. }
  70. /* Queues a struct vc4_exec_info for execution. If no job is
  71. @@ -685,6 +690,14 @@ vc4_get_bcl(struct drm_device *dev, stru
  72. goto fail;
  73. ret = vc4_validate_shader_recs(dev, exec);
  74. + if (ret)
  75. + goto fail;
  76. +
  77. + /* Block waiting on any previous rendering into the CS's VBO,
  78. + * IB, or textures, so that pixels are actually written by the
  79. + * time we try to read them.
  80. + */
  81. + ret = vc4_wait_for_seqno(dev, exec->bin_dep_seqno, ~0ull, true);
  82. fail:
  83. kfree(temp);
  84. --- a/drivers/gpu/drm/vc4/vc4_render_cl.c
  85. +++ b/drivers/gpu/drm/vc4/vc4_render_cl.c
  86. @@ -45,6 +45,8 @@ struct vc4_rcl_setup {
  87. struct drm_gem_cma_object *rcl;
  88. u32 next_offset;
  89. +
  90. + u32 next_write_bo_index;
  91. };
  92. static inline void rcl_u8(struct vc4_rcl_setup *setup, u8 val)
  93. @@ -407,6 +409,8 @@ static int vc4_rcl_msaa_surface_setup(st
  94. if (!*obj)
  95. return -EINVAL;
  96. + exec->rcl_write_bo[exec->rcl_write_bo_count++] = *obj;
  97. +
  98. if (surf->offset & 0xf) {
  99. DRM_ERROR("MSAA write must be 16b aligned.\n");
  100. return -EINVAL;
  101. @@ -417,7 +421,8 @@ static int vc4_rcl_msaa_surface_setup(st
  102. static int vc4_rcl_surface_setup(struct vc4_exec_info *exec,
  103. struct drm_gem_cma_object **obj,
  104. - struct drm_vc4_submit_rcl_surface *surf)
  105. + struct drm_vc4_submit_rcl_surface *surf,
  106. + bool is_write)
  107. {
  108. uint8_t tiling = VC4_GET_FIELD(surf->bits,
  109. VC4_LOADSTORE_TILE_BUFFER_TILING);
  110. @@ -440,6 +445,9 @@ static int vc4_rcl_surface_setup(struct
  111. if (!*obj)
  112. return -EINVAL;
  113. + if (is_write)
  114. + exec->rcl_write_bo[exec->rcl_write_bo_count++] = *obj;
  115. +
  116. if (surf->flags & VC4_SUBMIT_RCL_SURFACE_READ_IS_FULL_RES) {
  117. if (surf == &exec->args->zs_write) {
  118. DRM_ERROR("general zs write may not be a full-res.\n");
  119. @@ -542,6 +550,8 @@ vc4_rcl_render_config_surface_setup(stru
  120. if (!*obj)
  121. return -EINVAL;
  122. + exec->rcl_write_bo[exec->rcl_write_bo_count++] = *obj;
  123. +
  124. if (tiling > VC4_TILING_FORMAT_LT) {
  125. DRM_ERROR("Bad tiling format\n");
  126. return -EINVAL;
  127. @@ -599,15 +609,18 @@ int vc4_get_rcl(struct drm_device *dev,
  128. if (ret)
  129. return ret;
  130. - ret = vc4_rcl_surface_setup(exec, &setup.color_read, &args->color_read);
  131. + ret = vc4_rcl_surface_setup(exec, &setup.color_read, &args->color_read,
  132. + false);
  133. if (ret)
  134. return ret;
  135. - ret = vc4_rcl_surface_setup(exec, &setup.zs_read, &args->zs_read);
  136. + ret = vc4_rcl_surface_setup(exec, &setup.zs_read, &args->zs_read,
  137. + false);
  138. if (ret)
  139. return ret;
  140. - ret = vc4_rcl_surface_setup(exec, &setup.zs_write, &args->zs_write);
  141. + ret = vc4_rcl_surface_setup(exec, &setup.zs_write, &args->zs_write,
  142. + true);
  143. if (ret)
  144. return ret;
  145. --- a/drivers/gpu/drm/vc4/vc4_validate.c
  146. +++ b/drivers/gpu/drm/vc4/vc4_validate.c
  147. @@ -267,6 +267,9 @@ validate_indexed_prim_list(VALIDATE_ARGS
  148. if (!ib)
  149. return -EINVAL;
  150. + exec->bin_dep_seqno = max(exec->bin_dep_seqno,
  151. + to_vc4_bo(&ib->base)->write_seqno);
  152. +
  153. if (offset > ib->base.size ||
  154. (ib->base.size - offset) / index_size < length) {
  155. DRM_ERROR("IB access overflow (%d + %d*%d > %zd)\n",
  156. @@ -555,8 +558,7 @@ static bool
  157. reloc_tex(struct vc4_exec_info *exec,
  158. void *uniform_data_u,
  159. struct vc4_texture_sample_info *sample,
  160. - uint32_t texture_handle_index)
  161. -
  162. + uint32_t texture_handle_index, bool is_cs)
  163. {
  164. struct drm_gem_cma_object *tex;
  165. uint32_t p0 = *(uint32_t *)(uniform_data_u + sample->p_offset[0]);
  166. @@ -714,6 +716,11 @@ reloc_tex(struct vc4_exec_info *exec,
  167. *validated_p0 = tex->paddr + p0;
  168. + if (is_cs) {
  169. + exec->bin_dep_seqno = max(exec->bin_dep_seqno,
  170. + to_vc4_bo(&tex->base)->write_seqno);
  171. + }
  172. +
  173. return true;
  174. fail:
  175. DRM_INFO("Texture p0 at %d: 0x%08x\n", sample->p_offset[0], p0);
  176. @@ -835,7 +842,8 @@ validate_gl_shader_rec(struct drm_device
  177. if (!reloc_tex(exec,
  178. uniform_data_u,
  179. &validated_shader->texture_samples[tex],
  180. - texture_handles_u[tex])) {
  181. + texture_handles_u[tex],
  182. + i == 2)) {
  183. return -EINVAL;
  184. }
  185. }
  186. @@ -867,6 +875,9 @@ validate_gl_shader_rec(struct drm_device
  187. uint32_t stride = *(uint8_t *)(pkt_u + o + 5);
  188. uint32_t max_index;
  189. + exec->bin_dep_seqno = max(exec->bin_dep_seqno,
  190. + to_vc4_bo(&vbo->base)->write_seqno);
  191. +
  192. if (state->addr & 0x8)
  193. stride |= (*(uint32_t *)(pkt_u + 100 + i * 4)) & ~0xff;