@@ -133,69 +133,29 @@ def make_generate_function(self):
 
         self.generate_function = self.generate_step
         if keras.config.backend() == "openvino":
-            import os
-            from multiprocessing import Pipe
-            from multiprocessing import Process
-
             import openvino as ov
             import openvino.runtime.opset14 as ov_opset
-            import psutil
 
             from keras_hub.src.utils.keras_utils import print_msg
 
             def ov_infer(inputs, stop_token_ids, fn):
-                def isolated_infer(pipe, compiled_model, flat_inputs):
-                    outputs = compiled_model(flat_inputs)
-                    outputs = outputs.to_tuple()
-                    pipe.send(outputs)
-                    pipe.close()
-
                 def get_outputs(inputs, struct_outputs, compiled_ov_model):
                     flatten_inputs = tree.flatten(inputs)
-                    free_mem = psutil.virtual_memory().available / (1024**3)
-                    # On average OpenVINO needs about 2 GB to run
-                    # an inference, also it is wrapped by an env var,
-                    # to be tuned.
-                    threshold = float(
-                        os.getenv("OV_INFER_FREE_MEM_THRESHOLD", 2)
-                    )
-                    if free_mem > threshold:
-                        """Run inference in a separate process only if
-                        free memory usage is above a certain threshold.
-                        This threshold is calculated to ensure that
-                        swap memory won't be triggered. When swap is
-                        likely to be used, fallback to normal inference
-                        to avoid severe performance degradation.
-                        Running inference in a subprocess prevents OpenVINO from
-                        allocating extra memory in the main process during its
-                        internal infer request creation. This can reduce memory
-                        usage by 0.5–2 GB depending on the model size.
-                        However, using a subprocess introduces an extra
-                        overhead, increasing latency by around 1–2 seconds
-                        per inference.
-                        """
-                        parent_conn, child_conn = Pipe()
-                        p = Process(
-                            target=isolated_infer,
-                            args=(
-                                child_conn,
-                                compiled_ov_model,
-                                flatten_inputs,
-                            ),
-                        )
-                        p.start()
-                        outputs = parent_conn.recv()
-                        p.join()
-                    else:
-                        outputs = compiled_ov_model(flatten_inputs)
-                        outputs = outputs.to_tuple()
+                    outputs = compiled_ov_model(flatten_inputs).to_tuple()
                     outputs = self._unpack_singleton(
                         tree.pack_sequence_as(struct_outputs, outputs)
                     )
                     return outputs
 
+                core = ov.Core()
+                device = "GPU" if "GPU" in core.available_devices else "CPU"
+
                 # Try using the existing compiled model
-                if self.ov_compiled_model is not None:
+                if (
+                    self.ov_compiled_model is not None
+                    and getattr(self, "ov_device", None) is not None
+                    and device == self.ov_device
+                ):
                     try:
                         return get_outputs(
                             inputs, self.struct_outputs, self.ov_compiled_model
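
This hunk replaces the subprocess-isolated inference path (and its psutil-based free-memory gate) with a direct call on the compiled model, and it now picks a target device up front so the cached compiled model is reused only when it was built for that same device. A minimal sketch of the selection and reuse check, assuming only a stock OpenVINO install; cached_model and cached_device are illustrative stand-ins for the self.ov_compiled_model and self.ov_device attributes:

    import openvino as ov

    core = ov.Core()
    # available_devices lists installed device names, e.g. ["CPU"] or ["CPU", "GPU"].
    device = "GPU" if "GPU" in core.available_devices else "CPU"

    # Stand-ins for the cached attributes used in the diff: the compiled
    # model is reused only if it was compiled for `device`; otherwise the
    # caller falls through and recompiles.
    cached_model, cached_device = None, None
    reuse_cache = cached_model is not None and cached_device == device
    print(f"selected device: {device}, reuse cached model: {reuse_cache}")
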
@@ -228,10 +188,17 @@ def get_outputs(inputs, struct_outputs, compiled_ov_model):
                             ov.PartialShape([-1] * rank)
                         )
                 ov_model.validate_nodes_and_infer_types()
-                core = ov.Core()
-                device = "CPU"
-                # OpenVINO supports only compiling with 'CPU' devices.
-                self.ov_compiled_model = core.compile_model(ov_model, device)
+
+                self.ov_device = device
+                model_dtype = (
+                    "f16"
+                    if self.dtype == "float16" or self.dtype == "bfloat16"
+                    else "f32"
+                )
+                config = {"INFERENCE_PRECISION_HINT": model_dtype}
+                self.ov_compiled_model = core.compile_model(
+                    ov_model, device, config
+                )
                 return get_outputs(
                     inputs, self.struct_outputs, self.ov_compiled_model
                 )
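
The second hunk records the compile device and passes an inference-precision hint derived from the model dtype to compile_model. A hedged, self-contained sketch of that compile path using the standard OpenVINO Python API; the toy one-op model and the keras_dtype value are illustrative, not part of the change:

    import numpy as np
    import openvino as ov
    import openvino.runtime.opset14 as ov_opset

    # Toy one-op model so the example is self-contained.
    param = ov_opset.parameter([1, 4], dtype=ov.Type.f32, name="x")
    model = ov.Model([ov_opset.relu(param)], [param], "toy")

    core = ov.Core()
    device = "GPU" if "GPU" in core.available_devices else "CPU"

    # Same dtype-to-precision mapping as in the diff; keras_dtype is illustrative.
    keras_dtype = "bfloat16"
    model_dtype = "f16" if keras_dtype in ("float16", "bfloat16") else "f32"

    config = {"INFERENCE_PRECISION_HINT": model_dtype}
    compiled = core.compile_model(model, device, config)
    result = compiled([np.array([[1.0, -2.0, 3.0, -4.0]], dtype=np.float32)])
    print(device, model_dtype, result[0])

The hint asks the device plugin to execute in the requested precision where it can; exact behavior depends on the plugin, so f32 remains the safe default for full-precision models.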