centralize the determination of the string encoding

andlaus · andlaus · commit 7de4a8149007 · 2025-01-20T10:48:16.000+01:00
This is surprisingly complicated. Centralizing this code into a
`get_string_encoding()` function reduces the amount of
copy-and-pasted code and makes it possible to implement the
`MinMaxLengthType` diag coded type properly.

thanks to [at]kayoub5 for insisting on this.

Signed-off-by: Andreas Lauser <andreas.lauser@mercedes-benz.com>
diff --git a/odxtools/decodestate.py b/odxtools/decodestate.py
@@ -2,7 +2,7 @@
 from dataclasses import dataclass, field
 from typing import TYPE_CHECKING, Dict, List, Optional, Tuple
 
-from .encoding import Encoding
+from .encoding import Encoding, get_string_encoding
 from .exceptions import DecodeError, odxassert, odxraise, strict_mode
 from .odxtypes import AtomicOdxType, DataType, ParameterValue
 
@@ -114,29 +114,13 @@ def extract_atomic_value(
         # ... string types, ...
         elif base_data_type in (DataType.A_UTF8STRING, DataType.A_ASCIISTRING,
                                 DataType.A_UNICODE2STRING):
-            # note that the spec disallows certain combinations of
-            # base_data_type and encoding (e.g., A_ASCIISTRING encoded
-            # using UTF-8). Since in python3 strings are always
-            # capable of the full unicode character set, odxtools
-            # ignores these restrictions...
             text_errors = 'strict' if strict_mode else 'replace'
-            if base_type_encoding == Encoding.UTF8 or (base_data_type == DataType.A_UTF8STRING and
-                                                       base_type_encoding is None):
-                internal_value = raw_value.decode("utf-8", errors=text_errors)
-            elif base_type_encoding == Encoding.UCS2 or (base_data_type == DataType.A_UNICODE2STRING
-                                                         and base_type_encoding is None):
-                text_encoding = "utf-16-be" if is_highlow_byte_order else "utf-16-le"
-                internal_value = raw_value.decode(text_encoding, errors=text_errors)
-            elif base_type_encoding == Encoding.ISO_8859_1 or (
-                    base_data_type == DataType.A_ASCIISTRING and base_type_encoding is None):
-                internal_value = raw_value.decode("iso-8859-1", errors=text_errors)
-            elif base_type_encoding == Encoding.ISO_8859_2:
-                internal_value = raw_value.decode("iso-8859-2", errors=text_errors)
-            elif base_type_encoding == Encoding.WINDOWS_1252:
-                internal_value = raw_value.decode("cp1252", errors=text_errors)
+            str_encoding = get_string_encoding(base_data_type, base_type_encoding,
+                                               is_highlow_byte_order)
+            if str_encoding is not None:
+                internal_value = raw_value.decode(str_encoding, errors=text_errors)
             else:
-                odxraise(f"Specified illegal encoding {base_type_encoding} for string object")
-                internal_value = raw_value.decode("iso-8859-1", errors=text_errors)
+                internal_value = "ERROR"
 
         # ... signed integers, ...
         elif base_data_type == DataType.A_INT32:
diff --git a/odxtools/encodestate.py b/odxtools/encodestate.py
@@ -3,7 +3,7 @@
 from dataclasses import dataclass, field
 from typing import TYPE_CHECKING, Dict, List, Optional, SupportsBytes, Tuple
 
-from .encoding import Encoding
+from .encoding import Encoding, get_string_encoding
 from .exceptions import EncodeError, OdxWarning, odxassert, odxraise
 from .odxtypes import AtomicOdxType, DataType, ParameterValue
 
@@ -115,29 +115,12 @@ def emplace_atomic_value(
             if not isinstance(internal_value, str):
                 odxraise(f"The internal value {internal_value!r} is not a string", EncodeError)
 
-            # note that the spec disallows certain combinations of
-            # base_data_type and encoding (e.g., A_ASCIISTRING encoded
-            # using UTF-8). Since in python3 strings are always
-            # capable of the full unicode character set, odxtools
-            # ignores these restrictions...
-
-            if base_type_encoding == Encoding.UTF8 or (base_data_type == DataType.A_UTF8STRING and
-                                                       base_type_encoding is None):
-                raw_value = internal_value.encode("utf-8")
-            elif base_type_encoding == Encoding.UCS2 or (base_data_type == DataType.A_UNICODE2STRING
-                                                         and base_type_encoding is None):
-                text_encoding = "utf-16-be" if is_highlow_byte_order else "utf-16-le"
-                raw_value = internal_value.encode(text_encoding)
-            elif base_type_encoding == Encoding.ISO_8859_1 or (
-                    base_data_type == DataType.A_ASCIISTRING and base_type_encoding is None):
-                raw_value = internal_value.encode("iso-8859-1")
-            elif base_type_encoding == Encoding.ISO_8859_2:
-                raw_value = internal_value.encode("iso-8859-2")
-            elif base_type_encoding == Encoding.WINDOWS_1252:
-                raw_value = internal_value.encode("cp1252")
+            str_encoding = get_string_encoding(base_data_type, base_type_encoding,
+                                               is_highlow_byte_order)
+            if str_encoding is not None:
+                raw_value = internal_value.encode(str_encoding)
             else:
-                odxraise(f"Specified illegal encoding {base_type_encoding} for string object")
-                raw_value = internal_value.encode("iso-8859-1")
+                raw_value = b""
 
             if 8 * len(raw_value) > bit_length:
                 odxraise(
diff --git a/odxtools/encoding.py b/odxtools/encoding.py
@@ -1,5 +1,9 @@
 # SPDX-License-Identifier: MIT
 from enum import Enum
+from typing import Optional
+
+from .exceptions import odxraise
+from .odxtypes import DataType
 
 
 class Encoding(Enum):
@@ -17,3 +21,36 @@ class Encoding(Enum):
     WINDOWS_1252 = "WINDOWS-1252"
 
     NONE = "NONE"
+
+
+def get_string_encoding(base_data_type: DataType, base_type_encoding: Optional[Encoding],
+                        is_highlow_byte_order: bool) -> Optional[str]:
+    """If the encoding is for a string, return the value for
+    `str.encode()`/`str.decode()` to convert the string object
+    to/from a byte array
+    """
+
+    # note that the spec disallows certain combinations of
+    # base_data_type and encoding (e.g., A_ASCIISTRING encoded
+    # using UTF-8). Since python3 strings cover the full unicode
+    # character set, odxtools ignores these restrictions: the data
+    # type only picks the default if base_type_encoding is None.
+    if base_type_encoding == Encoding.UTF8 or (base_data_type == DataType.A_UTF8STRING and
+                                               base_type_encoding is None):
+        return "utf-8"
+    elif base_type_encoding == Encoding.UCS2 or (base_data_type == DataType.A_UNICODE2STRING and
+                                                 base_type_encoding is None):
+        return "utf-16-be" if is_highlow_byte_order else "utf-16-le"
+    elif base_type_encoding == Encoding.ISO_8859_1 or (base_data_type == DataType.A_ASCIISTRING and
+                                                       base_type_encoding is None):
+        return "iso-8859-1"
+    elif base_type_encoding == Encoding.ISO_8859_2:
+        return "iso-8859-2"
+    elif base_type_encoding == Encoding.WINDOWS_1252:
+        return "cp1252"
+    else:
+        odxraise(f"Specified illegal encoding {base_type_encoding} for {base_data_type.value} "
+                 f"string object")
+        return "iso-8859-1"  # fallback, reached only if odxraise() returns
+
+    return None  # NOTE(review): unreachable, every branch above returns
diff --git a/odxtools/minmaxlengthtype.py b/odxtools/minmaxlengthtype.py
@@ -8,6 +8,7 @@
 from .decodestate import DecodeState
 from .diagcodedtype import DctType, DiagCodedType
 from .encodestate import EncodeState
+from .encoding import get_string_encoding
 from .exceptions import DecodeError, EncodeError, odxassert, odxraise, odxrequire
 from .odxlink import OdxDocFragment
 from .odxtypes import AtomicOdxType, DataType
@@ -78,11 +79,21 @@ def encode_into_pdu(self, internal_value: AtomicOdxType, encode_state: EncodeSta
             odxraise("MinMaxLengthType is currently only implemented for strings and byte arrays",
                      EncodeError)
 
-        # TODO: This assumes that each character of a string is
-        #       encoded into a single byte. This is never the case for
-        #       UTF-16 encoded strings and does not always hold for
-        #       UTF-8. We ignore this issue for now...
-        data_length = len(internal_value)
+        raw_value = b''
+        if isinstance(internal_value, str):
+            str_encoding = get_string_encoding(self.base_data_type, self.base_type_encoding,
+                                               self.is_highlow_byte_order)
+
+            if str_encoding is None:
+                odxraise(f"Internal string value specified for object which is "
+                         f"'{self.base_data_type.value}' not a string")
+                raw_value = b''
+            else:
+                raw_value = internal_value.encode(str_encoding)
+        else:
+            raw_value = bytes(internal_value)
+
+        data_length = len(raw_value)
 
         if data_length < self.min_length:
             odxraise(
@@ -98,12 +109,12 @@ def encode_into_pdu(self, internal_value: AtomicOdxType, encode_state: EncodeSta
             data_length = self.max_length
 
         encode_state.emplace_atomic_value(
-            internal_value=internal_value,
+            internal_value=raw_value,
             used_mask=None,
             bit_length=8 * data_length,
-            base_data_type=self.base_data_type,
-            base_type_encoding=self.base_type_encoding,
-            is_highlow_byte_order=self.is_highlow_byte_order,
+            base_data_type=DataType.A_BYTEFIELD,
+            base_type_encoding=None,
+            is_highlow_byte_order=True,
         )
 
         # TODO: ensure that the termination delimiter is not