Skip to content

Commit 2f3c098

Browse files
authored
fix: Address mypy typing errors in v2 SDK (#157)
Fix all mypy errors due to incorrect typing after the SDK v2 merge (#135). Logic changes should be minimal; this is mostly to change type hints where an `httpx.Response` is used instead of a `requests.Response`, etc. I removed some `form_utils.py` functions where we no longer need to convert an httpx request back to Requests. There's more we can clean up in here, but let's get the V2 migration settled first.

Add mypy to `make lint` so that we can catch these errors before merging. The publish job runs a full linter suite, and these changes made it to main but broke the publish job.

Also, remove the Patch Custom Code step that I added to the generate workflow. This broke the job. There are some minor changes to the Speakeasy code on the main branch. In the short term, this means we'll have to run `make patch-custom-code` whenever we regenerate.

# To verify

Make sure you can lint and run the tests locally: `make lint` and `make test`. You can also verify that the pdf split behavior has not changed with a call to your local server:

```
from unstructured_client import UnstructuredClient
from unstructured_client.models import shared, operations
import json

filename = "_sample_docs/layout-parser-paper.pdf"

s = UnstructuredClient(
    server_url="http://localhost:8000",
)

with open(filename, "rb") as f:
    files=shared.Files(
        content=f,
        file_name=filename,
    )

    req = operations.PartitionRequest(
        shared.PartitionParameters(
            files=files,
            strategy="fast",
            split_pdf_page_range=[4,8],
        ),
    )

    resp = s.general.partition(req)

print(json.dumps(resp.elements, indent=4))
```
1 parent 793145e commit 2f3c098

File tree

7 files changed

+80
-176
lines changed

7 files changed

+80
-176
lines changed

.github/workflows/speakeasy_sdk_generation.yml

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -24,10 +24,3 @@ jobs:
2424
github_access_token: ${{ secrets.GITHUB_TOKEN }}
2525
pypi_token: ${{ secrets.PYPI_TOKEN }}
2626
speakeasy_api_key: ${{ secrets.SPEAKEASY_API_KEY }}
27-
patch-custom-code:
28-
runs-on: ubuntu-latest
29-
needs: [generate]
30-
steps:
31-
- name: Patch in custom code after regenerating
32-
run: make patch-custom-code
33-

Makefile

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,8 +13,7 @@ install-test:
1313

1414
.PHONY: install-dev
1515
install-dev:
16-
pip install jupyter
17-
pip install pylint
16+
pip install jupyter uvloop pylint mypy
1817

1918
## install: installs all test, dev, and experimental requirements
2019
.PHONY: install
@@ -48,6 +47,7 @@ test-integration-docker:
4847
.PHONY: lint
4948
lint:
5049
pylint --rcfile=pylintrc src
50+
mypy src
5151

5252
#############
5353
# Speakeasy #

_test_unstructured_client/unit/test_split_pdf_hook.py

Lines changed: 2 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -126,57 +126,6 @@ def test_unit_create_response():
126126
assert response.headers.get("Content-Length"), expected_content_length
127127

128128

129-
def test_unit_create_request():
130-
"""Test create request method properly sets file, Content-Type and Content-Length headers.
131-
List parameters should be flattened in the body."""
132-
133-
# Prepare test data
134-
request = requests.PreparedRequest()
135-
request.headers = {
136-
"Content-Type": "application/json",
137-
"Authorization": "Bearer token",
138-
}
139-
form_data = {
140-
"parameter_1": "value_1",
141-
"parameter_2": "value_2",
142-
"list_parameter": ["value_1", "value_2"],
143-
}
144-
page = (io.BytesIO(b"page_content"), 1)
145-
filename = "test_file.pdf"
146-
147-
# Expected results
148-
expected_page_filename = "test_file.pdf"
149-
expected_body = MultipartEncoder(
150-
fields=[
151-
("parameter_1", "value_1"),
152-
("parameter_2", "value_2"),
153-
("list_parameter", "value_1"),
154-
("list_parameter", "value_2"),
155-
("split_pdf_page", "false"),
156-
("starting_page_number", "7"),
157-
("files", (
158-
expected_page_filename,
159-
page[0],
160-
"application/pdf",
161-
)),
162-
]
163-
)
164-
expected_url = ""
165-
166-
# Create request
167-
body = request_utils.create_request_body(form_data, page[0], filename, 7)
168-
request_obj = request_utils.create_request(request, body)
169-
request_content_type: str = request_obj.headers.get("Content-Type")
170-
# Assert the request object
171-
assert request_obj.method == "POST"
172-
assert request_obj.url == expected_url
173-
174-
# Validate fields ignoring order
175-
assert set(request_obj.data.fields) == set(expected_body.fields)
176-
177-
assert request_content_type.startswith("multipart/form-data")
178-
179-
180129
def test_unit_decode_content_disposition():
181130
"""Test decode content disposition method properly decodes Content-Disposition header."""
182131

@@ -362,13 +311,13 @@ def test_get_optimal_split_size(num_pages, concurrency_level, expected_split_siz
362311
("form_data", "expected_result"),
363312
[
364313
({}, DEFAULT_CONCURRENCY_LEVEL), # no value
365-
({"split_pdf_concurrency_level": 10}, 10), # valid number
314+
({"split_pdf_concurrency_level": "10"}, 10), # valid number
366315
(
367316
# exceeds max value
368317
{"split_pdf_concurrency_level": f"{MAX_CONCURRENCY_LEVEL + 1}"},
369318
MAX_CONCURRENCY_LEVEL,
370319
),
371-
({"split_pdf_concurrency_level": -3}, DEFAULT_CONCURRENCY_LEVEL), # negative value
320+
({"split_pdf_concurrency_level": "-3"}, DEFAULT_CONCURRENCY_LEVEL), # negative value
372321
],
373322
)
374323
def test_unit_get_split_pdf_concurrency_level_returns_valid_number(form_data, expected_result):

src/unstructured_client/_hooks/custom/form_utils.py

Lines changed: 10 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
import logging
44
from typing import Union
55

6-
from requests_toolbelt.multipart.decoder import MultipartDecoder
6+
from requests_toolbelt.multipart.decoder import MultipartDecoder # type: ignore
77

88
from unstructured_client._hooks.custom.common import UNSTRUCTURED_CLIENT_LOGGER_NAME
99
from unstructured_client.models import shared
@@ -35,7 +35,7 @@ def get_page_range(form_data: FormData, key: str, max_pages: int) -> tuple[int,
3535
try:
3636
_page_range = form_data.get(key)
3737

38-
if _page_range is not None:
38+
if isinstance(_page_range, list):
3939
page_range = (int(_page_range[0]), int(_page_range[1]))
4040
else:
4141
page_range = (1, max_pages)
@@ -108,7 +108,7 @@ def get_split_pdf_allow_failed_param(
108108
"""
109109
allow_failed = form_data.get(key)
110110

111-
if allow_failed is None:
111+
if not isinstance(allow_failed, str):
112112
return fallback_value
113113

114114
if allow_failed.lower() not in ["true", "false"]:
@@ -121,6 +121,7 @@ def get_split_pdf_allow_failed_param(
121121

122122
return allow_failed.lower() == "true"
123123

124+
124125
def get_split_pdf_concurrency_level_param(
125126
form_data: FormData, key: str, fallback_value: int, max_allowed: int
126127
) -> int:
@@ -140,7 +141,7 @@ def get_split_pdf_concurrency_level_param(
140141
"""
141142
concurrency_level_str = form_data.get(key)
142143

143-
if concurrency_level_str is None:
144+
if not isinstance(concurrency_level_str, str):
144145
return fallback_value
145146

146147
try:
@@ -218,10 +219,12 @@ def parse_form_data(decoded_data: MultipartDecoder) -> FormData:
218219
else:
219220
content = part.content.decode()
220221
if name in form_data:
221-
if isinstance(form_data[name], list):
222-
form_data[name].append(content)
222+
form_data_value = form_data[name]
223+
if isinstance(form_data_value, list):
224+
form_data_value.append(content)
223225
else:
224-
form_data[name] = [form_data[name], content]
226+
new_list = [form_data_value, content]
227+
form_data[name] = new_list
225228
else:
226229
form_data[name] = content
227230

src/unstructured_client/_hooks/custom/pdf_utils.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
import io
22
import logging
3-
from typing import Generator, Tuple, Optional
3+
from typing import cast, Generator, Tuple, Optional
44

55
from pypdf import PdfReader, PdfWriter
66
from pypdf.errors import PdfReadError
@@ -70,7 +70,8 @@ def is_pdf(file: shared.Files) -> bool:
7070
return False
7171

7272
try:
73-
PdfReader(io.BytesIO(file.content), strict=True)
73+
content = cast(bytes, file.content)
74+
PdfReader(io.BytesIO(content), strict=True)
7475
except (PdfReadError, UnicodeDecodeError) as exc:
7576
logger.error(exc)
7677
logger.warning("The file does not appear to be a valid PDF.")

src/unstructured_client/_hooks/custom/request_utils.py

Lines changed: 21 additions & 66 deletions
Original file line numberDiff line numberDiff line change
@@ -5,12 +5,10 @@
55
import io
66
import json
77
import logging
8-
from typing import Optional, Tuple, Any
8+
from typing import Tuple, Any
99

1010
import httpx
11-
import requests
12-
from requests.structures import CaseInsensitiveDict
13-
from requests_toolbelt.multipart.encoder import MultipartEncoder
11+
from requests_toolbelt.multipart.encoder import MultipartEncoder # type: ignore
1412

1513
from unstructured_client._hooks.custom.common import UNSTRUCTURED_CLIENT_LOGGER_NAME
1614
from unstructured_client._hooks.custom.form_utils import (
@@ -51,76 +49,33 @@ def create_request_body(
5149
return body
5250

5351

54-
def create_httpx_request(
55-
original_request: requests.Request, body: MultipartEncoder
56-
) -> httpx.Request:
57-
headers = prepare_request_headers(original_request.headers)
58-
return httpx.Request(
59-
method="POST",
60-
url=original_request.url or "",
61-
content=body.to_string(),
62-
headers={**headers, "Content-Type": body.content_type},
63-
)
64-
65-
66-
def create_request(
67-
request: requests.PreparedRequest,
68-
body: MultipartEncoder,
69-
) -> requests.Request:
70-
headers = prepare_request_headers(request.headers)
71-
return requests.Request(
72-
method="POST",
73-
url=request.url or "",
74-
data=body,
75-
headers={**headers, "Content-Type": body.content_type},
76-
)
77-
78-
7952
async def call_api_async(
8053
client: httpx.AsyncClient,
8154
page: Tuple[io.BytesIO, int],
82-
original_request: requests.Request,
55+
original_request: httpx.Request,
8356
form_data: FormData,
8457
filename: str,
8558
limiter: asyncio.Semaphore,
86-
) -> tuple[int, dict]:
59+
) -> httpx.Response:
8760
page_content, page_number = page
8861
body = create_request_body(form_data, page_content, filename, page_number)
89-
new_request = create_httpx_request(original_request, body)
90-
async with limiter:
91-
try:
92-
response = await client.send(new_request)
93-
return response.status_code, response.json()
94-
except Exception:
95-
logger.error("Failed to send request for page %d", page_number)
96-
return 500, {}
97-
62+
original_headers = prepare_request_headers(original_request.headers)
9863

99-
def call_api(
100-
client: Optional[requests.Session],
101-
page: Tuple[io.BytesIO, int],
102-
request: requests.PreparedRequest,
103-
form_data: FormData,
104-
filename: str,
105-
) -> requests.Response:
106-
if client is None:
107-
raise RuntimeError("HTTP client not accessible!")
108-
page_content, page_number = page
109-
110-
body = create_request_body(form_data, page_content, filename, page_number)
111-
new_request = create_request(request, body)
112-
prepared_request = client.prepare_request(new_request)
64+
new_request = httpx.Request(
65+
method="POST",
66+
url=original_request.url or "",
67+
content=body.to_string(),
68+
headers={**original_headers, "Content-Type": body.content_type},
69+
)
11370

114-
try:
115-
return client.send(prepared_request)
116-
except Exception:
117-
logger.error("Failed to send request for page %d", page_number)
118-
return requests.Response()
71+
async with limiter:
72+
response = await client.send(new_request)
73+
return response
11974

12075

12176
def prepare_request_headers(
122-
headers: CaseInsensitiveDict[str],
123-
) -> CaseInsensitiveDict[str]:
77+
headers: httpx.Headers,
78+
) -> httpx.Headers:
12479
"""Prepare the request headers by removing the 'Content-Type' and 'Content-Length' headers.
12580
12681
Args:
@@ -129,10 +84,10 @@ def prepare_request_headers(
12984
Returns:
13085
The modified request headers.
13186
"""
132-
headers = copy.deepcopy(headers)
133-
headers.pop("Content-Type", None)
134-
headers.pop("Content-Length", None)
135-
return headers
87+
new_headers = headers.copy()
88+
new_headers.pop("Content-Type", None)
89+
new_headers.pop("Content-Length", None)
90+
return new_headers
13691

13792

13893
def prepare_request_payload(form_data: FormData) -> FormData:
@@ -157,7 +112,7 @@ def prepare_request_payload(form_data: FormData) -> FormData:
157112
return payload
158113

159114

160-
def create_response(response: requests.Response, elements: list) -> requests.Response:
115+
def create_response(response: httpx.Response, elements: list) -> httpx.Response:
161116
"""
162117
Creates a modified response object with updated content.
163118

0 commit comments

Comments
 (0)