mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2024-11-14 23:09:53 +00:00
vulkan : argsort barriers must be under uniform control flow (ggml/951)
A return before a barrier (one that is taken by only some threads in a workgroup) leads to undefined behavior (UB), because barriers must be executed under uniform control flow. While the old code actually works on some devices, it fails on others (e.g. "smaller" GPUs). BTW, I think it would be better to set specialization constants when the graph is built, so that the local workgroup could be sized appropriately, but that would take a lot of work. Signed-off-by: Salvatore Mesoraca <s.mesoraca16@gmail.com>
This commit is contained in:
parent
6084bfb261
commit
544f409b4b
@ -29,20 +29,18 @@ void main() {
|
|||||||
const int col = int(gl_LocalInvocationID.x);
|
const int col = int(gl_LocalInvocationID.x);
|
||||||
const uint row = gl_WorkGroupID.y;
|
const uint row = gl_WorkGroupID.y;
|
||||||
|
|
||||||
if (col >= p.ncols_pad) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
const uint row_offset = row * p.ncols;
|
const uint row_offset = row * p.ncols;
|
||||||
|
|
||||||
// initialize indices
|
// initialize indices
|
||||||
dst_row[col] = col;
|
if (col < p.ncols_pad) {
|
||||||
|
dst_row[col] = col;
|
||||||
|
}
|
||||||
barrier();
|
barrier();
|
||||||
|
|
||||||
for (uint k = 2; k <= p.ncols_pad; k *= 2) {
|
for (uint k = 2; k <= p.ncols_pad; k *= 2) {
|
||||||
for (uint j = k / 2; j > 0; j /= 2) {
|
for (uint j = k / 2; j > 0; j /= 2) {
|
||||||
const uint ixj = col ^ j;
|
const uint ixj = col ^ j;
|
||||||
if (ixj > col) {
|
if (col < p.ncols_pad && ixj > col) {
|
||||||
if ((col & k) == 0) {
|
if ((col & k) == 0) {
|
||||||
if (dst_row[col] >= p.ncols ||
|
if (dst_row[col] >= p.ncols ||
|
||||||
(dst_row[ixj] < p.ncols && (p.order == ASC ?
|
(dst_row[ixj] < p.ncols && (p.order == ASC ?
|
||||||
|
Loading…
Reference in New Issue
Block a user