@@ -3219,7 +3219,8 @@ struct ggml_tensor * ggml_new_tensor_impl(
         /*.pad =*/ { 0 },
     };
 
-    ggml_assert_aligned(result->data);
+    // TODO: this should not be needed as long as we don't rely on aligned SIMD loads
+    //ggml_assert_aligned(result->data);
 
     for (int i = 0; i < n_dims; i++) {
        result->ne[i] = ne[i];
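For context on what is being disabled: an alignment assertion of this kind fails whenever a tensor's data pointer is not aligned to ggml's memory-alignment boundary, which is exactly what happens for views that start at an arbitrary byte offset (see `ggml_view_3d` below). A minimal sketch, assuming a 16-byte alignment constant (the real constant lives in ggml.c):

```c
#include <assert.h>
#include <stdint.h>

#define ALIGN 16  // assumed value; stands in for ggml's internal alignment constant

// Fails if ptr is not ALIGN-byte aligned -- the kind of check that a view
// starting at an arbitrary byte offset into a tensor would trip.
static void assert_aligned(const void * ptr) {
    assert(((uintptr_t) ptr) % ALIGN == 0);
}

int main(void) {
    _Alignas(16) char buf[32];
    assert_aligned(buf);        // passes: buf is explicitly 16-byte aligned
    //assert_aligned(buf + 2);  // would abort: offset pointers lose alignment
    return 0;
}
```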
@@ -3620,7 +3621,14 @@ float * ggml_get_data_f32(const struct ggml_tensor * tensor) {
 struct ggml_tensor * ggml_view_tensor(
         struct ggml_context * ctx,
         const struct ggml_tensor * src) {
-    return ggml_new_tensor_impl(ctx, src->type, src->n_dims, src->ne, src->data);
+    struct ggml_tensor * result = ggml_new_tensor_impl(ctx, src->type, src->n_dims, src->ne, src->data);
+
+    result->nb[0] = src->nb[0];
+    result->nb[1] = src->nb[1];
+    result->nb[2] = src->nb[2];
+    result->nb[3] = src->nb[3];
+
+    return result;
 }
 
 ////////////////////////////////////////////////////////////////////////////////
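This fix matters because `ggml_new_tensor_impl` derives contiguous strides from the shape alone, which is wrong when `src` is itself a transposed or permuted view; the view must inherit the source's `nb[]` values. A toy illustration (types and names are hypothetical, not ggml API):

```c
#include <stdint.h>
#include <stdio.h>
#include <string.h>

// Toy 4-D tensor: element counts in ne[], byte strides in nb[].
struct toy { int64_t ne[4]; size_t nb[4]; float * data; };

// Mirrors the patched ggml_view_tensor: share data and shape, and copy the
// source's byte strides instead of recomputing contiguous ones.
static struct toy toy_view(const struct toy * src) {
    struct toy v;
    memcpy(v.ne, src->ne, sizeof(v.ne));
    memcpy(v.nb, src->nb, sizeof(v.nb)); // the essential fix
    v.data = src->data;
    return v;
}

int main(void) {
    float buf[12] = {0};
    // a transposed 4x3 view of a contiguous 3x4 buffer: along dim 0 the
    // elements are 12 bytes apart, along dim 1 only 4 bytes apart
    struct toy src = { {4, 3, 1, 1}, {12, 4, 48, 48}, buf };
    struct toy v   = toy_view(&src);
    // prints "12 4"; recomputed contiguous strides for a 4x3 f32 tensor
    // would have been "4 16", silently breaking the view
    printf("%zu %zu\n", v.nb[0], v.nb[1]);
    return 0;
}
```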
@@ -4510,6 +4518,37 @@ struct ggml_tensor * ggml_view_2d(
     return result;
 }
 
+// ggml_view_3d
+
+struct ggml_tensor * ggml_view_3d(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        int64_t               ne0,
+        int64_t               ne1,
+        int64_t               ne2,
+        size_t                nb1,
+        size_t                nb2,
+        size_t                offset) {
+    if (a->grad) {
+        GGML_ASSERT(false); // gradient propagation is not supported
+    }
+
+    const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, ne2, 1 };
+
+    struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 3, ne, (char *) a->data + offset);
+
+    result->nb[1] = nb1;
+    result->nb[2] = nb2;
+    result->nb[3] = result->nb[2]*ne2;
+
+    result->op   = GGML_OP_VIEW;
+    result->grad = NULL;
+    result->src0 = a;
+    result->src1 = NULL; // TODO: maybe store the offset here?
+
+    return result;
+}
+
 // ggml_permute
 
 struct ggml_tensor * ggml_permute(
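As a usage note for the new API: `nb1`, `nb2`, and `offset` are all in bytes, and `nb3` is derived so the view closes over `ne2` planes. A hypothetical call site (every identifier except `ggml_view_3d` itself -- `ctx`, `kv`, `n_embd`, `n_head`, `n_tokens` -- is assumed here for illustration):

```c
// Hypothetical: take a zero-copy 3-D window over the first n_tokens planes
// of a larger tensor kv, inheriting its row/plane strides.
struct ggml_tensor * win = ggml_view_3d(ctx, kv,
        n_embd,     // ne0: elements per row
        n_head,     // ne1: rows per plane
        n_tokens,   // ne2: number of planes
        kv->nb[1],  // nb1: byte stride between rows
        kv->nb[2],  // nb2: byte stride between planes
        0);         // offset: byte offset into kv->data
```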
@@ -4845,7 +4884,6 @@ static void ggml_compute_forward_dup_f16(
         const struct ggml_tensor * src0,
         struct ggml_tensor * dst) {
     GGML_ASSERT(params->ith == 0);
-    GGML_ASSERT(ggml_is_contiguous(dst));
     GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0));
 
     if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
@@ -4862,85 +4900,96 @@ static void ggml_compute_forward_dup_f16(
     const size_t nb02 = src0->nb[2];
     const size_t nb03 = src0->nb[3];
 
-    if (ggml_is_contiguous(src0) && src0->type == dst->type) {
+    const size_t nb0 = dst->nb[0];
+    const size_t nb1 = dst->nb[1];
+    const size_t nb2 = dst->nb[2];
+    const size_t nb3 = dst->nb[3];
+
+    if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst) && src0->type == dst->type) {
         memcpy(dst->data, src0->data, ggml_nelements(dst) * GGML_TYPE_SIZE[src0->type]);
         return;
     }
 
-    if (src0->nb[0] == sizeof(ggml_fp16_t)) {
-        if (dst->type == GGML_TYPE_F16) {
-            size_t id = 0;
-            const size_t rs = ne00*nb00;
-
-            for (int64_t i03 = 0; i03 < ne03; i03++) {
-                for (int64_t i02 = 0; i02 < ne02; i02++) {
-                    for (int64_t i01 = 0; i01 < ne01; i01++) {
-                        const char * src0_ptr = (char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03;
-                        char * dst_ptr = (char *) dst->data + id*rs;
-
-                        memcpy(dst_ptr, src0_ptr, rs);
-
-                        id++;
-                    }
-                }
-            }
-        } else if (dst->type == GGML_TYPE_F32) {
-            size_t id = 0;
-            float * dst_ptr = (float *) dst->data;
-
-            for (int64_t i03 = 0; i03 < ne03; i03++) {
-                for (int64_t i02 = 0; i02 < ne02; i02++) {
-                    for (int64_t i01 = 0; i01 < ne01; i01++) {
-                        for (int64_t i00 = 0; i00 < ne00; i00++) {
-                            const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
-
-                            dst_ptr[id] = GGML_FP16_TO_FP32(*src0_ptr);
-                            id++;
-                        }
-                    }
-                }
-            }
-        } else {
-            GGML_ASSERT(false); // TODO: implement
-        }
-    } else {
-        //printf("%s: this is not optimal - fix me\n", __func__);
-
-        if (dst->type == GGML_TYPE_F32) {
-            size_t id = 0;
-            float * dst_ptr = (float *) dst->data;
-
-            for (int64_t i03 = 0; i03 < ne03; i03++) {
-                for (int64_t i02 = 0; i02 < ne02; i02++) {
-                    for (int64_t i01 = 0; i01 < ne01; i01++) {
-                        for (int64_t i00 = 0; i00 < ne00; i00++) {
-                            const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
-
-                            dst_ptr[id] = GGML_FP16_TO_FP32(*src0_ptr);
-                            id++;
-                        }
-                    }
-                }
-            }
-        } else if (dst->type == GGML_TYPE_F16) {
-            size_t id = 0;
-            ggml_fp16_t * dst_ptr = (ggml_fp16_t *) dst->data;
-
-            for (int64_t i03 = 0; i03 < ne03; i03++) {
-                for (int64_t i02 = 0; i02 < ne02; i02++) {
-                    for (int64_t i01 = 0; i01 < ne01; i01++) {
-                        for (int64_t i00 = 0; i00 < ne00; i00++) {
-                            const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
-
-                            dst_ptr[id] = *src0_ptr;
-                            id++;
-                        }
-                    }
-                }
-            }
-        } else {
-            GGML_ASSERT(false); // TODO: implement
-        }
-    }
+    if (src0->type == dst->type &&
+        src0->ne[0] == dst->ne[0] &&
+        src0->nb[0] == GGML_TYPE_SIZE[src0->type] && dst->nb[0] == GGML_TYPE_SIZE[dst->type]) {
+        // copy by rows
+        const size_t rs = ne00*nb00;
+        for (int64_t i03 = 0; i03 < ne03; i03++) {
+            for (int64_t i02 = 0; i02 < ne02; i02++) {
+                for (int64_t i01 = 0; i01 < ne01; i01++) {
+                    memcpy(
+                        ((char *)  dst->data + i01*nb1  + i02*nb2  + i03*nb3),
+                        ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03),
+                        rs);
+                }
+            }
+        }
+        return;
+    }
+
+    // TODO: add more special-case implementations for tensor shapes/strides that can benefit from memcpy
+
+    // dst counters
+    int64_t i10 = 0;
+    int64_t i11 = 0;
+    int64_t i12 = 0;
+    int64_t i13 = 0;
+
+    if (dst->type == GGML_TYPE_F16) {
+        for (int64_t i03 = 0; i03 < ne03; i03++) {
+            for (int64_t i02 = 0; i02 < ne02; i02++) {
+                for (int64_t i01 = 0; i01 < ne01; i01++) {
+                    for (int64_t i00 = 0; i00 < ne00; i00++) {
+                        const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
+                        char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3);
+
+                        memcpy(dst_ptr, src0_ptr, sizeof(ggml_fp16_t));
+
+                        if (++i10 == ne00) {
+                            i10 = 0;
+                            if (++i11 == ne01) {
+                                i11 = 0;
+                                if (++i12 == ne02) {
+                                    i12 = 0;
+                                    if (++i13 == ne03) {
+                                        i13 = 0;
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    } else if (dst->type == GGML_TYPE_F32) {
+        for (int64_t i03 = 0; i03 < ne03; i03++) {
+            for (int64_t i02 = 0; i02 < ne02; i02++) {
+                for (int64_t i01 = 0; i01 < ne01; i01++) {
+                    for (int64_t i00 = 0; i00 < ne00; i00++) {
+                        const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
+                        char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3);
+
+                        *(float *) dst_ptr = GGML_FP16_TO_FP32(*(const ggml_fp16_t *) src0_ptr);
+
+                        if (++i10 == ne00) {
+                            i10 = 0;
+                            if (++i11 == ne01) {
+                                i11 = 0;
+                                if (++i12 == ne02) {
+                                    i12 = 0;
+                                    if (++i13 == ne03) {
+                                        i13 = 0;
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    } else {
+        GGML_ASSERT(false); // TODO: implement
+    }
 }
 
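The rewritten kernel walks `src0` in its natural `i00..i03` order while the `i10..i13` counters advance through `dst` like an odometer, carrying into the next dimension on wrap; this is what lets `dst` have an arbitrary, possibly non-contiguous layout. The same pattern isolated as a runnable sketch (names are illustrative, not ggml code):

```c
#include <stdint.h>
#include <stdio.h>

// Odometer carry over a 4-D shape: the innermost counter advances once per
// element and wraps into the next dimension, mirroring the i10..i13 logic
// in the rewritten dup kernels.
static void step(int64_t i[4], const int64_t ne[4]) {
    if (++i[0] == ne[0]) { i[0] = 0;
        if (++i[1] == ne[1]) { i[1] = 0;
            if (++i[2] == ne[2]) { i[2] = 0;
                if (++i[3] == ne[3]) { i[3] = 0; }
            }
        }
    }
}

int main(void) {
    const int64_t ne[4] = {2, 2, 1, 1};
    int64_t i[4] = {0, 0, 0, 0};
    for (int n = 0; n < 4; n++) {
        printf("(%lld,%lld)\n", (long long) i[0], (long long) i[1]);
        step(i, ne);
    }
    return 0; // prints (0,0) (1,0) (0,1) (1,1)
}
```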
@@ -4949,7 +4998,6 @@ static void ggml_compute_forward_dup_f32(
         const struct ggml_tensor * src0,
         struct ggml_tensor * dst) {
     GGML_ASSERT(params->ith == 0);
-    GGML_ASSERT(ggml_is_contiguous(dst));
     GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0));
 
     if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
@@ -4966,85 +5014,76 @@ static void ggml_compute_forward_dup_f32(
     const size_t nb02 = src0->nb[2];
     const size_t nb03 = src0->nb[3];
 
-    if (ggml_is_contiguous(src0) && src0->type == dst->type) {
+    const size_t nb0 = dst->nb[0];
+    const size_t nb1 = dst->nb[1];
+    const size_t nb2 = dst->nb[2];
+    const size_t nb3 = dst->nb[3];
+
+    if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst) && src0->type == dst->type) {
         memcpy(dst->data, src0->data, ggml_nelements(dst) * GGML_TYPE_SIZE[src0->type]);
         return;
     }
 
-    if (src0->nb[0] == sizeof(float)) {
-        if (dst->type == GGML_TYPE_F32) {
-            size_t id = 0;
-            const size_t rs = ne00*nb00;
-
-            for (int64_t i03 = 0; i03 < ne03; i03++) {
-                for (int64_t i02 = 0; i02 < ne02; i02++) {
-                    for (int64_t i01 = 0; i01 < ne01; i01++) {
-                        const char * src0_ptr = (char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03;
-                        char * dst_ptr = (char *) dst->data + id*rs;
-
-                        memcpy(dst_ptr, src0_ptr, rs);
-
-                        id++;
-                    }
-                }
-            }
-        } else if (dst->type == GGML_TYPE_F16) {
-            size_t id = 0;
-            ggml_fp16_t * dst_ptr = (ggml_fp16_t *) dst->data;
-
-            for (int64_t i03 = 0; i03 < ne03; i03++) {
-                for (int64_t i02 = 0; i02 < ne02; i02++) {
-                    for (int64_t i01 = 0; i01 < ne01; i01++) {
-                        for (int64_t i00 = 0; i00 < ne00; i00++) {
-                            const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
-
-                            dst_ptr[id] = GGML_FP32_TO_FP16(*src0_ptr);
-                            id++;
-                        }
-                    }
-                }
-            }
-        } else {
-            GGML_ASSERT(false); // TODO: implement
-        }
-    } else {
-        //printf("%s: this is not optimal - fix me\n", __func__);
-
-        if (dst->type == GGML_TYPE_F32) {
-            size_t id = 0;
-            float * dst_ptr = (float *) dst->data;
-
-            for (int64_t i03 = 0; i03 < ne03; i03++) {
-                for (int64_t i02 = 0; i02 < ne02; i02++) {
-                    for (int64_t i01 = 0; i01 < ne01; i01++) {
-                        for (int64_t i00 = 0; i00 < ne00; i00++) {
-                            const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
-
-                            dst_ptr[id] = *src0_ptr;
-                            id++;
-                        }
-                    }
-                }
-            }
-        } else if (dst->type == GGML_TYPE_F16) {
-            size_t id = 0;
-            ggml_fp16_t * dst_ptr = (ggml_fp16_t *) dst->data;
-
-            for (int64_t i03 = 0; i03 < ne03; i03++) {
-                for (int64_t i02 = 0; i02 < ne02; i02++) {
-                    for (int64_t i01 = 0; i01 < ne01; i01++) {
-                        for (int64_t i00 = 0; i00 < ne00; i00++) {
-                            const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
-
-                            dst_ptr[id] = GGML_FP32_TO_FP16(*src0_ptr);
-                            id++;
-                        }
-                    }
-                }
-            }
-        } else {
-            GGML_ASSERT(false); // TODO: implement
-        }
-    }
+    // dst counters
+    int64_t i10 = 0;
+    int64_t i11 = 0;
+    int64_t i12 = 0;
+    int64_t i13 = 0;
+
+    if (dst->type == GGML_TYPE_F32) {
+        for (int64_t i03 = 0; i03 < ne03; i03++) {
+            for (int64_t i02 = 0; i02 < ne02; i02++) {
+                for (int64_t i01 = 0; i01 < ne01; i01++) {
+                    for (int64_t i00 = 0; i00 < ne00; i00++) {
+                        const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
+                        char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3);
+
+                        memcpy(dst_ptr, src0_ptr, sizeof(float));
+
+                        if (++i10 == dst->ne[0]) {
+                            i10 = 0;
+                            if (++i11 == dst->ne[1]) {
+                                i11 = 0;
+                                if (++i12 == dst->ne[2]) {
+                                    i12 = 0;
+                                    if (++i13 == dst->ne[3]) {
+                                        i13 = 0;
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    } else if (dst->type == GGML_TYPE_F16) {
+        for (int64_t i03 = 0; i03 < ne03; i03++) {
+            for (int64_t i02 = 0; i02 < ne02; i02++) {
+                for (int64_t i01 = 0; i01 < ne01; i01++) {
+                    for (int64_t i00 = 0; i00 < ne00; i00++) {
+                        const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
+                        char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3);
+
+                        *(ggml_fp16_t *) dst_ptr = GGML_FP32_TO_FP16(*(const float *) src0_ptr);
+
+                        if (++i10 == dst->ne[0]) {
+                            i10 = 0;
+                            if (++i11 == dst->ne[1]) {
+                                i11 = 0;
+                                if (++i12 == dst->ne[2]) {
+                                    i12 = 0;
+                                    if (++i13 == dst->ne[3]) {
+                                        i13 = 0;
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    } else {
+        GGML_ASSERT(false); // TODO: implement
+    }
 }
 
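Both dup kernels rely on the same strided addressing rule: the byte address of element (i0, i1, i2, i3) is `data + i0*nb[0] + i1*nb[1] + i2*nb[2] + i3*nb[3]`, which holds for contiguous and non-contiguous layouts alike. A standalone sketch of that rule (toy values, not ggml API):

```c
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

// General strided addressing: byte strides in nb[] make the same formula
// valid for contiguous, transposed, and viewed tensors.
static void * elem_addr(void * data, const int64_t i[4], const size_t nb[4]) {
    return (char *) data + i[0]*nb[0] + i[1]*nb[1] + i[2]*nb[2] + i[3]*nb[3];
}

int main(void) {
    float buf[6] = {0, 1, 2, 3, 4, 5};                 // 2x3 contiguous f32
    const int64_t i[4]  = {1, 2, 0, 0};                // element (1,2)
    const size_t  nb[4] = {4, 8, 24, 24};              // byte strides
    printf("%f\n", *(float *) elem_addr(buf, i, nb));  // prints 5.000000
    return 0;
}
```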