Skip to content

Commit 4d0924a

Browse files
authored
Vulkan Phi Fix for AMD Proprietary Drivers (ggml-org#5260)
* Replace tanh to avoid NaN in gelu shader on AMD proprietary driver * Fix another Vulkan CPY buffer size bug
1 parent 8ca511c commit 4d0924a

File tree

3 files changed

+83
-69
lines changed

3 files changed

+83
-69
lines changed

ggml-vulkan-shaders.hpp

+69-63
Original file line numberDiff line numberDiff line change
@@ -14670,14 +14670,14 @@ const uint64_t f32_to_f16_fp32_len = 1596;
1467014670

1467114671
unsigned char gelu_f32_data[] = {
1467214672
0x03,0x02,0x23,0x07,0x00,0x05,0x01,0x00,0x0b,0x00,0x0d,0x00,
14673-
0x45,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x11,0x00,0x02,0x00,
14673+
0x4b,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x11,0x00,0x02,0x00,
1467414674
0x01,0x00,0x00,0x00,0x0b,0x00,0x06,0x00,0x01,0x00,0x00,0x00,
1467514675
0x47,0x4c,0x53,0x4c,0x2e,0x73,0x74,0x64,0x2e,0x34,0x35,0x30,
1467614676
0x00,0x00,0x00,0x00,0x0e,0x00,0x03,0x00,0x00,0x00,0x00,0x00,
1467714677
0x01,0x00,0x00,0x00,0x0f,0x00,0x09,0x00,0x05,0x00,0x00,0x00,
1467814678
0x04,0x00,0x00,0x00,0x6d,0x61,0x69,0x6e,0x00,0x00,0x00,0x00,
1467914679
0x0b,0x00,0x00,0x00,0x14,0x00,0x00,0x00,0x24,0x00,0x00,0x00,
14680-
0x2c,0x00,0x00,0x00,0x10,0x00,0x06,0x00,0x04,0x00,0x00,0x00,
14680+
0x38,0x00,0x00,0x00,0x10,0x00,0x06,0x00,0x04,0x00,0x00,0x00,
1468114681
0x11,0x00,0x00,0x00,0x00,0x02,0x00,0x00,0x01,0x00,0x00,0x00,
1468214682
0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x0b,0x00,0x00,0x00,
1468314683
0x0b,0x00,0x00,0x00,0x1c,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
@@ -14696,15 +14696,15 @@ unsigned char gelu_f32_data[] = {
1469614696
0x22,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
1469714697
0x24,0x00,0x00,0x00,0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
1469814698
0x47,0x00,0x04,0x00,0x24,0x00,0x00,0x00,0x21,0x00,0x00,0x00,
14699-
0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x29,0x00,0x00,0x00,
14699+
0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x35,0x00,0x00,0x00,
1470014700
0x06,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x48,0x00,0x04,0x00,
14701-
0x2a,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x19,0x00,0x00,0x00,
14702-
0x48,0x00,0x05,0x00,0x2a,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
14701+
0x36,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x19,0x00,0x00,0x00,
14702+
0x48,0x00,0x05,0x00,0x36,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
1470314703
0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,
14704-
0x2a,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
14705-
0x2c,0x00,0x00,0x00,0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
14706-
0x47,0x00,0x04,0x00,0x2c,0x00,0x00,0x00,0x21,0x00,0x00,0x00,
14707-
0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x42,0x00,0x00,0x00,
14704+
0x36,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
14705+
0x38,0x00,0x00,0x00,0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
14706+
0x47,0x00,0x04,0x00,0x38,0x00,0x00,0x00,0x21,0x00,0x00,0x00,
14707+
0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x48,0x00,0x00,0x00,
1470814708
0x0b,0x00,0x00,0x00,0x19,0x00,0x00,0x00,0x13,0x00,0x02,0x00,
1470914709
0x02,0x00,0x00,0x00,0x21,0x00,0x03,0x00,0x03,0x00,0x00,0x00,
1471014710
0x02,0x00,0x00,0x00,0x15,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
@@ -14731,64 +14731,70 @@ unsigned char gelu_f32_data[] = {
1473114731
0x23,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x22,0x00,0x00,0x00,
1473214732
0x3b,0x00,0x04,0x00,0x23,0x00,0x00,0x00,0x24,0x00,0x00,0x00,
1473314733
0x0c,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x26,0x00,0x00,0x00,
14734-
0x0c,0x00,0x00,0x00,0x11,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,
14735-
0x29,0x00,0x00,0x00,0x11,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,
14736-
0x2a,0x00,0x00,0x00,0x29,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
14737-
0x2b,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x2a,0x00,0x00,0x00,
14738-
0x3b,0x00,0x04,0x00,0x2b,0x00,0x00,0x00,0x2c,0x00,0x00,0x00,
14734+
0x0c,0x00,0x00,0x00,0x11,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
14735+
0x11,0x00,0x00,0x00,0x2a,0x00,0x00,0x00,0x2a,0x42,0x4c,0x3f,
14736+
0x2b,0x00,0x04,0x00,0x11,0x00,0x00,0x00,0x2d,0x00,0x00,0x00,
14737+
0x00,0x00,0x80,0x3f,0x2b,0x00,0x04,0x00,0x11,0x00,0x00,0x00,
14738+
0x2e,0x00,0x00,0x00,0x13,0x27,0x37,0x3d,0x1d,0x00,0x03,0x00,
14739+
0x35,0x00,0x00,0x00,0x11,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,
14740+
0x36,0x00,0x00,0x00,0x35,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
14741+
0x37,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x36,0x00,0x00,0x00,
14742+
0x3b,0x00,0x04,0x00,0x37,0x00,0x00,0x00,0x38,0x00,0x00,0x00,
1473914743
0x0c,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x11,0x00,0x00,0x00,
14740-
0x2e,0x00,0x00,0x00,0x00,0x00,0x00,0x3f,0x2b,0x00,0x04,0x00,
14741-
0x11,0x00,0x00,0x00,0x31,0x00,0x00,0x00,0x00,0x00,0x80,0x3f,
14742-
0x2b,0x00,0x04,0x00,0x11,0x00,0x00,0x00,0x32,0x00,0x00,0x00,
14743-
0x2a,0x42,0x4c,0x3f,0x2b,0x00,0x04,0x00,0x11,0x00,0x00,0x00,
14744-
0x35,0x00,0x00,0x00,0x13,0x27,0x37,0x3d,0x2b,0x00,0x04,0x00,
14745-
0x06,0x00,0x00,0x00,0x40,0x00,0x00,0x00,0x00,0x02,0x00,0x00,
14746-
0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x41,0x00,0x00,0x00,
14747-
0x01,0x00,0x00,0x00,0x2c,0x00,0x06,0x00,0x09,0x00,0x00,0x00,
14748-
0x42,0x00,0x00,0x00,0x40,0x00,0x00,0x00,0x41,0x00,0x00,0x00,
14749-
0x41,0x00,0x00,0x00,0x36,0x00,0x05,0x00,0x02,0x00,0x00,0x00,
14750-
0x04,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x03,0x00,0x00,0x00,
14751-
0xf8,0x00,0x02,0x00,0x05,0x00,0x00,0x00,0xf7,0x00,0x03,0x00,
14752-
0x43,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0xfb,0x00,0x03,0x00,
14753-
0x0c,0x00,0x00,0x00,0x44,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
14754-
0x44,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x0d,0x00,0x00,0x00,
14755-
0x0e,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
14756-
0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,
14757-
0x0e,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x17,0x00,0x00,0x00,
14758-
0x18,0x00,0x00,0x00,0x14,0x00,0x00,0x00,0x16,0x00,0x00,0x00,
14759-
0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x19,0x00,0x00,0x00,
14760-
0x18,0x00,0x00,0x00,0xae,0x00,0x05,0x00,0x1a,0x00,0x00,0x00,
14761-
0x1b,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,0x19,0x00,0x00,0x00,
14762-
0xf7,0x00,0x03,0x00,0x1d,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
14763-
0xfa,0x00,0x04,0x00,0x1b,0x00,0x00,0x00,0x1c,0x00,0x00,0x00,
14764-
0x1d,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0x1c,0x00,0x00,0x00,
14765-
0xf9,0x00,0x02,0x00,0x43,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
14766-
0x1d,0x00,0x00,0x00,0x41,0x00,0x06,0x00,0x26,0x00,0x00,0x00,
14767-
0x27,0x00,0x00,0x00,0x24,0x00,0x00,0x00,0x16,0x00,0x00,0x00,
14768-
0x0f,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x11,0x00,0x00,0x00,
14769-
0x28,0x00,0x00,0x00,0x27,0x00,0x00,0x00,0x85,0x00,0x05,0x00,
14744+
0x3a,0x00,0x00,0x00,0x00,0x00,0x00,0x3f,0x2b,0x00,0x04,0x00,
14745+
0x11,0x00,0x00,0x00,0x3d,0x00,0x00,0x00,0x00,0x00,0x00,0x40,
14746+
0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x46,0x00,0x00,0x00,
14747+
0x00,0x02,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
14748+
0x47,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x2c,0x00,0x06,0x00,
14749+
0x09,0x00,0x00,0x00,0x48,0x00,0x00,0x00,0x46,0x00,0x00,0x00,
14750+
0x47,0x00,0x00,0x00,0x47,0x00,0x00,0x00,0x36,0x00,0x05,0x00,
14751+
0x02,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
14752+
0x03,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0x05,0x00,0x00,0x00,
14753+
0xf7,0x00,0x03,0x00,0x49,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
14754+
0xfb,0x00,0x03,0x00,0x0c,0x00,0x00,0x00,0x4a,0x00,0x00,0x00,
14755+
0xf8,0x00,0x02,0x00,0x4a,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
14756+
0x0d,0x00,0x00,0x00,0x0e,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
14757+
0x0c,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
14758+
0x0f,0x00,0x00,0x00,0x0e,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
14759+
0x17,0x00,0x00,0x00,0x18,0x00,0x00,0x00,0x14,0x00,0x00,0x00,
14760+
0x16,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
14761+
0x19,0x00,0x00,0x00,0x18,0x00,0x00,0x00,0xae,0x00,0x05,0x00,
14762+
0x1a,0x00,0x00,0x00,0x1b,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,
14763+
0x19,0x00,0x00,0x00,0xf7,0x00,0x03,0x00,0x1d,0x00,0x00,0x00,
14764+
0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,0x1b,0x00,0x00,0x00,
14765+
0x1c,0x00,0x00,0x00,0x1d,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
14766+
0x1c,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x49,0x00,0x00,0x00,
14767+
0xf8,0x00,0x02,0x00,0x1d,0x00,0x00,0x00,0x41,0x00,0x06,0x00,
14768+
0x26,0x00,0x00,0x00,0x27,0x00,0x00,0x00,0x24,0x00,0x00,0x00,
14769+
0x16,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
14770+
0x11,0x00,0x00,0x00,0x28,0x00,0x00,0x00,0x27,0x00,0x00,0x00,
14771+
0x85,0x00,0x05,0x00,0x11,0x00,0x00,0x00,0x2c,0x00,0x00,0x00,
14772+
0x2a,0x00,0x00,0x00,0x28,0x00,0x00,0x00,0x85,0x00,0x05,0x00,
1477014773
0x11,0x00,0x00,0x00,0x30,0x00,0x00,0x00,0x2e,0x00,0x00,0x00,
14774+
0x28,0x00,0x00,0x00,0x0c,0x00,0x08,0x00,0x11,0x00,0x00,0x00,
14775+
0x33,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,
14776+
0x30,0x00,0x00,0x00,0x28,0x00,0x00,0x00,0x2d,0x00,0x00,0x00,
14777+
0x85,0x00,0x05,0x00,0x11,0x00,0x00,0x00,0x34,0x00,0x00,0x00,
14778+
0x2c,0x00,0x00,0x00,0x33,0x00,0x00,0x00,0x85,0x00,0x05,0x00,
14779+
0x11,0x00,0x00,0x00,0x3c,0x00,0x00,0x00,0x3a,0x00,0x00,0x00,
1477114780
0x28,0x00,0x00,0x00,0x85,0x00,0x05,0x00,0x11,0x00,0x00,0x00,
14772-
0x34,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0x28,0x00,0x00,0x00,
14773-
0x85,0x00,0x05,0x00,0x11,0x00,0x00,0x00,0x37,0x00,0x00,0x00,
14774-
0x35,0x00,0x00,0x00,0x28,0x00,0x00,0x00,0x0c,0x00,0x08,0x00,
14775-
0x11,0x00,0x00,0x00,0x3a,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
14776-
0x32,0x00,0x00,0x00,0x37,0x00,0x00,0x00,0x28,0x00,0x00,0x00,
14777-
0x31,0x00,0x00,0x00,0x85,0x00,0x05,0x00,0x11,0x00,0x00,0x00,
14778-
0x3b,0x00,0x00,0x00,0x34,0x00,0x00,0x00,0x3a,0x00,0x00,0x00,
14779-
0x0c,0x00,0x06,0x00,0x11,0x00,0x00,0x00,0x3c,0x00,0x00,0x00,
14780-
0x01,0x00,0x00,0x00,0x15,0x00,0x00,0x00,0x3b,0x00,0x00,0x00,
14781-
0x81,0x00,0x05,0x00,0x11,0x00,0x00,0x00,0x3d,0x00,0x00,0x00,
14782-
0x31,0x00,0x00,0x00,0x3c,0x00,0x00,0x00,0x85,0x00,0x05,0x00,
14783-
0x11,0x00,0x00,0x00,0x3e,0x00,0x00,0x00,0x30,0x00,0x00,0x00,
14784-
0x3d,0x00,0x00,0x00,0x41,0x00,0x06,0x00,0x26,0x00,0x00,0x00,
14785-
0x3f,0x00,0x00,0x00,0x2c,0x00,0x00,0x00,0x16,0x00,0x00,0x00,
14786-
0x0f,0x00,0x00,0x00,0x3e,0x00,0x03,0x00,0x3f,0x00,0x00,0x00,
14787-
0x3e,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,0x43,0x00,0x00,0x00,
14788-
0xf8,0x00,0x02,0x00,0x43,0x00,0x00,0x00,0xfd,0x00,0x01,0x00,
14789-
0x38,0x00,0x01,0x00,
14781+
0x3f,0x00,0x00,0x00,0x3d,0x00,0x00,0x00,0x34,0x00,0x00,0x00,
14782+
0x0c,0x00,0x06,0x00,0x11,0x00,0x00,0x00,0x40,0x00,0x00,0x00,
14783+
0x01,0x00,0x00,0x00,0x1b,0x00,0x00,0x00,0x3f,0x00,0x00,0x00,
14784+
0x81,0x00,0x05,0x00,0x11,0x00,0x00,0x00,0x41,0x00,0x00,0x00,
14785+
0x40,0x00,0x00,0x00,0x2d,0x00,0x00,0x00,0x88,0x00,0x05,0x00,
14786+
0x11,0x00,0x00,0x00,0x42,0x00,0x00,0x00,0x3d,0x00,0x00,0x00,
14787+
0x41,0x00,0x00,0x00,0x83,0x00,0x05,0x00,0x11,0x00,0x00,0x00,
14788+
0x43,0x00,0x00,0x00,0x3d,0x00,0x00,0x00,0x42,0x00,0x00,0x00,
14789+
0x85,0x00,0x05,0x00,0x11,0x00,0x00,0x00,0x44,0x00,0x00,0x00,
14790+
0x3c,0x00,0x00,0x00,0x43,0x00,0x00,0x00,0x41,0x00,0x06,0x00,
14791+
0x26,0x00,0x00,0x00,0x45,0x00,0x00,0x00,0x38,0x00,0x00,0x00,
14792+
0x16,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,0x3e,0x00,0x03,0x00,
14793+
0x45,0x00,0x00,0x00,0x44,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
14794+
0x49,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0x49,0x00,0x00,0x00,
14795+
0xfd,0x00,0x01,0x00,0x38,0x00,0x01,0x00,
1479014796
};
14791-
const uint64_t gelu_f32_len = 1408;
14797+
const uint64_t gelu_f32_len = 1484;
1479214798

1479314799
unsigned char get_rows_f16_data[] = {
1479414800
0x03,0x02,0x23,0x07,0x00,0x05,0x01,0x00,0x0b,0x00,0x0d,0x00,

ggml-vulkan.cpp

+12-5
Original file line numberDiff line numberDiff line change
@@ -2876,6 +2876,9 @@ static void ggml_vk_op_f32(vk_context * ctx, const ggml_tensor * src0, const ggm
28762876
x_sz = ggml_nbytes(src0);
28772877
d_sz = ggml_nbytes(dst);
28782878

2879+
if (extra_src0->offset + x_sz >= d_X->size) {
2880+
x_sz = VK_WHOLE_SIZE;
2881+
}
28792882
if (extra->offset + d_sz >= d_D->size) {
28802883
d_sz = VK_WHOLE_SIZE;
28812884
}
@@ -2911,12 +2914,16 @@ static void ggml_vk_op_f32(vk_context * ctx, const ggml_tensor * src0, const ggm
29112914
break;
29122915
}
29132916

2914-
x_sz *= ne02 * ne03;
2915-
if (y_sz != VK_WHOLE_SIZE) {
2916-
y_sz *= ne12 * ne13;
2917-
}
29182917
if (op != GGML_OP_CPY) {
2919-
d_sz *= ne02 * ne03;
2918+
if (x_sz != VK_WHOLE_SIZE) {
2919+
x_sz *= ne02 * ne03;
2920+
}
2921+
if (y_sz != VK_WHOLE_SIZE) {
2922+
y_sz *= ne12 * ne13;
2923+
}
2924+
if (d_sz != VK_WHOLE_SIZE) {
2925+
d_sz *= ne02 * ne03;
2926+
}
29202927
}
29212928

29222929
if (!use_src1 && op == GGML_OP_SOFT_MAX) {

ggml_vk_generate_shaders.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -1689,7 +1689,8 @@
16891689
}
16901690
16911691
const float xi = float(data_a[i]);
1692-
data_d[i] = D_TYPE(0.5f*xi*(1.0f + tanh(SQRT_2_OVER_PI*xi*(1.0f + GELU_COEF_A*xi*xi))));
1692+
const float val = SQRT_2_OVER_PI*xi*(1.0f + GELU_COEF_A*xi*xi);
1693+
data_d[i] = D_TYPE(0.5f*xi*(2.0f - 2.0f / (exp(2 * val) + 1)));
16931694
}
16941695
"""
16951696

0 commit comments

Comments
 (0)