@@ -207,7 +207,7 @@ def process_outputs(self, prev_nodes: List, result_dict: Dict) -> Dict:
             all_outputs.update(result_dict[prev_node])
         return all_outputs
 
-    async def wrap_iterable(self, aiterable, is_first=True):
+    def wrap_iterable(self, iterable, is_first=True):
 
         with tracer.start_as_current_span("llm_generate_stream") if ENABLE_OPEA_TELEMETRY else contextlib.nullcontext():
             while True:
@@ -217,10 +217,10 @@ async def wrap_iterable(self, aiterable, is_first=True):
                     else contextlib.nullcontext()
                 ):  # else tracer.start_as_current_span(f"llm_generate_stream_next_token")
                     try:
-                        token = await anext(aiterable)
+                        token = next(iterable)
                         yield token
                         is_first = False
-                    except StopAsyncIteration:
+                    except StopIteration:
                         # Exiting the iterable loop cleanly
                         break
                     except Exception as e:
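The change above swaps the asynchronous iterator protocol (`anext` / `StopAsyncIteration`) for the plain synchronous one (`next` / `StopIteration`) while keeping the per-token tracing structure. Below is a minimal, self-contained sketch of the same wrapping pattern, using a made-up `fake_stream()` generator in place of the streamed HTTP response iterator:

```python
# Illustrative sketch only; `fake_stream` is a stand-in for the streamed
# HTTP response iterator used by the orchestrator.
def fake_stream():
    for token in ["Hello", " ", "world", "!"]:
        yield token.encode("utf-8")


def wrap_iterable(iterable, is_first=True):
    # Pull tokens one at a time so per-token work (e.g. a tracing span)
    # can be attached around each next() call.
    while True:
        try:
            token = next(iterable)
            yield token
            is_first = False
        except StopIteration:
            # The underlying iterator is exhausted; end the generator cleanly.
            break


for chunk in wrap_iterable(fake_stream()):
    print(chunk.decode("utf-8"), end="")
```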
@@ -259,51 +259,49 @@ async def execute(
                 if ENABLE_OPEA_TELEMETRY
                 else contextlib.nullcontext()
             ):
-                async with aiohttp.ClientSession() as session:
-                    response = await session.post(
-                        url=endpoint,
-                        data=json.dumps(inputs),
-                        headers={"Content-type": "application/json"},
-                        proxy=None,
-                        timeout=aiohttp.ClientTimeout(total=1000),
-                    )
+                response = requests.post(
+                    url=endpoint,
+                    data=json.dumps(inputs),
+                    headers={"Content-type": "application/json"},
+                    proxies={"http": None},
+                    stream=True,
+                    timeout=1000,
+                )
             downstream = runtime_graph.downstream(cur_node)
             if downstream:
                 assert len(downstream) == 1, "Not supported multiple stream downstreams yet!"
                 cur_node = downstream[0]
                 hitted_ends = [".", "?", "!", "。", "，", "！"]
                 downstream_endpoint = self.services[downstream[0]].endpoint_path
 
-            async def generate():
+            def generate():
                 token_start = req_start
                 if response:
                     # response.elapsed = time until first headers received
                     buffered_chunk_str = ""
                     is_first = True
-                    async for chunk in self.wrap_iterable(response.content.iter_chunked(None)):
+                    for chunk in self.wrap_iterable(response.iter_content(chunk_size=None)):
                         if chunk:
                             if downstream:
                                 chunk = chunk.decode("utf-8")
                                 buffered_chunk_str += self.extract_chunk_str(chunk)
                                 is_last = chunk.endswith("[DONE]\n\n")
                                 if (buffered_chunk_str and buffered_chunk_str[-1] in hitted_ends) or is_last:
-                                    async with aiohttp.ClientSession() as downstream_session:
-                                        res = await downstream_session.post(
-                                            url=downstream_endpoint,
-                                            data=json.dumps({"text": buffered_chunk_str}),
-                                            proxy=None,
-                                        )
-                                        res_json = await res.json()
-                                        if "text" in res_json:
-                                            res_txt = res_json["text"]
-                                        else:
-                                            raise Exception("Other response types not supported yet!")
-                                        buffered_chunk_str = ""  # clear
-                                        async for item in self.token_generator(
-                                            res_txt, token_start, is_first=is_first, is_last=is_last
-                                        ):
-                                            yield item
-                                        token_start = time.time()
+                                    res = requests.post(
+                                        url=downstream_endpoint,
+                                        data=json.dumps({"text": buffered_chunk_str}),
+                                        proxies={"http": None},
+                                    )
+                                    res_json = res.json()
+                                    if "text" in res_json:
+                                        res_txt = res_json["text"]
+                                    else:
+                                        raise Exception("Other response types not supported yet!")
+                                    buffered_chunk_str = ""  # clear
+                                    yield from self.token_generator(
+                                        res_txt, token_start, is_first=is_first, is_last=is_last
+                                    )
+                                    token_start = time.time()
                             else:
                                 token_start = self.metrics.token_update(token_start, is_first)
                                 yield chunk
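Taken together, this hunk drops the aiohttp client in favor of blocking `requests` calls: `stream=True` keeps the LLM response open, `iter_content(chunk_size=None)` yields chunks as they arrive, and buffered text is forwarded to the downstream service at sentence boundaries. Below is a self-contained sketch of that pattern; the endpoint URLs and payloads are placeholders, not part of this commit:

```python
# Standalone sketch of the synchronous streaming pattern the hunk switches to.
import json
import requests

LLM_ENDPOINT = "http://localhost:9000/v1/chat/completions"    # placeholder
DOWNSTREAM_ENDPOINT = "http://localhost:9001/v1/text2speech"  # placeholder

# stream=True defers downloading the body; iter_content(chunk_size=None)
# then yields data as it arrives instead of buffering the whole response.
response = requests.post(
    url=LLM_ENDPOINT,
    data=json.dumps({"query": "hello"}),
    headers={"Content-type": "application/json"},
    proxies={"http": None},
    stream=True,
    timeout=1000,
)

buffered = ""
for chunk in response.iter_content(chunk_size=None):
    if not chunk:
        continue
    buffered += chunk.decode("utf-8")
    # Flush the buffer to the downstream service at sentence boundaries.
    if buffered and buffered[-1] in [".", "?", "!"]:
        res = requests.post(
            url=DOWNSTREAM_ENDPOINT,
            data=json.dumps({"text": buffered}),
            proxies={"http": None},
        )
        print(res.json().get("text", ""))
        buffered = ""
```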