Skip to content

Commit 24f5021

Browse files
authored
Improve dataprep CI and fix pptx file ingesting bug (opea-project#1334)
- Fix permission issue for when ingesting pptx file with embedded image - Add more test coverage to the dataprep CI and unify common dataprep CI test code for DB backends: qdrant, milvus, redis, pgvector Signed-off-by: Lianhao Lu <[email protected]>
1 parent cbb8a82 commit 24f5021

File tree

13 files changed

+248
-278
lines changed

13 files changed

+248
-278
lines changed

comps/dataprep/src/utils.py

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -257,13 +257,12 @@ async def load_pptx(pptx_path):
257257
if table_contents:
258258
text += table_contents + "\n"
259259
if hasattr(shape, "image") and hasattr(shape.image, "blob"):
260-
img_path = f"./{shape.image.filename}"
261-
with open(img_path, "wb") as f:
260+
with tempfile.NamedTemporaryFile() as f:
262261
f.write(shape.image.blob)
263-
img_text = await load_image(img_path)
264-
if img_text:
265-
text += img_text + "\n"
266-
os.remove(img_path)
262+
f.flush()
263+
img_text = await load_image(f.name)
264+
if img_text:
265+
text += img_text + "\n"
267266
return text
268267

269268

comps/third_parties/opensearch/deployment/docker_compose/compose.yaml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ services:
2828
- "${OPENSEARCH_PORT1:-9200}:9200"
2929
- "${OPENSEARCH_PORT2:-9600}:9600"
3030
healthcheck:
31-
test: ["CMD-SHELL", "sleep 10 && exit 0"]
32-
interval: 1s
31+
test: ["CMD-SHELL", "curl -f https://localhost:9200 -ku 'admin:$OPENSEARCH_INITIAL_ADMIN_PASSWORD' || exit 1"]
32+
interval: 5s
3333
timeout: 15s
34-
retries: 1
34+
retries: 12

tests/dataprep/dataprep_utils.sh

Lines changed: 136 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,136 @@
1+
#!/usr/bin/env bash
2+
3+
# Copyright (C) 2025 Intel Corporation
4+
# SPDX-License-Identifier: Apache-2.0
5+
6+
# Absolute path of the directory containing this file, resolved from
# BASH_SOURCE so it is still correct when the file is sourced by another
# script. The ingest_* helpers below use it to locate the ingest_dataprep.*
# fixture files.
SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
7+
8+
# call_curl <url> <http_header> <remaining params>
9+
# Run curl against a URL with one request header and capture the outcome.
# Arguments:
#   $1 - request URL
#   $2 - HTTP header line handed to curl via -H
#   $@ - any remaining arguments, forwarded verbatim to curl
# Globals written:
#   HTTP_RESPONSE - raw curl output with "HTTPSTATUS:<code>" appended
#   HTTP_STATUS   - the text following the last "HTTPSTATUS:" marker
#   RESPONSE_BODY - everything preceding that marker
function call_curl() {
    local url=$1
    local header=$2
    shift 2
    # "$@" keeps each extra argument as a single word, so values containing
    # spaces or glob characters reach curl unchanged.
    HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -H "$header" "${url}" "$@")
    # Split on the trailer with parameter expansion instead of piping an
    # unquoted echo through sed, which would glob-expand and re-wrap the body.
    HTTP_STATUS=${HTTP_RESPONSE##*HTTPSTATUS:}
    RESPONSE_BODY=${HTTP_RESPONSE%HTTPSTATUS:*}
}
17+
18+
# _invoke_curl <fqdn> <port> <action> <remaining params passed to curl ...>
19+
# POST to http://<fqdn>:<port>/v1/dataprep/<action> and capture the outcome.
# Arguments:
#   $1 - service host name or address
#   $2 - service port
#   $3 - dataprep action: ingest, delete or get
#   $@ - any remaining arguments, forwarded verbatim to curl
# Globals written:
#   HTTP_RESPONSE - raw curl output with "HTTPSTATUS:<code>" appended
#   HTTP_STATUS   - the text following the last "HTTPSTATUS:" marker
#   RESPONSE_BODY - everything preceding that marker
# Exits the shell with status 1 on an unsupported action.
function _invoke_curl() {
    local url="http://$1:$2/v1/dataprep/$3"
    local action=$3
    shift 3
    local header=""
    # The Content-Type header depends on the action: ingest sends form data,
    # delete and get send JSON.
    case "$action" in
        ingest)
            header='Content-Type: multipart/form-data'
            ;;
        delete|get)
            header='Content-Type: application/json'
            ;;
        *)
            echo "Error: Unsupported dataprep action $action!"
            exit 1
            ;;
    esac

    # "$@" keeps each extra argument as a single word (e.g. a -F value whose
    # file path contains a space).
    HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -H "$header" "${url}" "$@")
    # Split on the trailer with parameter expansion instead of piping an
    # unquoted echo through sed, which would glob-expand and re-wrap the body.
    HTTP_STATUS=${HTTP_RESPONSE##*HTTPSTATUS:}
    RESPONSE_BODY=${HTTP_RESPONSE%HTTPSTATUS:*}
}
41+
42+
# ingest_<type> <service fqdn> <port> <remaining params passed to curl ...>
43+
# Upload ${SCRIPT_DIR}/ingest_dataprep.doc to the dataprep ingest endpoint.
# Arguments:
#   $1 - service host name or address
#   $2 - service port
#   $@ - any remaining arguments, forwarded to curl through _invoke_curl
# The result is left in the globals set by _invoke_curl.
function ingest_doc() {
    local fqdn=$1
    local port=$2
    shift 2
    # Quote every expansion so extra parameters with spaces stay intact.
    _invoke_curl "$fqdn" "$port" ingest -F "files=@${SCRIPT_DIR}/ingest_dataprep.doc" "$@"
}
49+
50+
# Upload ${SCRIPT_DIR}/ingest_dataprep.docx to the dataprep ingest endpoint.
# Arguments:
#   $1 - service host name or address
#   $2 - service port
#   $@ - any remaining arguments, forwarded to curl through _invoke_curl
# The result is left in the globals set by _invoke_curl.
function ingest_docx() {
    local fqdn=$1
    local port=$2
    shift 2
    # Quote every expansion so extra parameters with spaces stay intact.
    _invoke_curl "$fqdn" "$port" ingest -F "files=@${SCRIPT_DIR}/ingest_dataprep.docx" "$@"
}
56+
57+
# Upload ${SCRIPT_DIR}/ingest_dataprep.pdf to the dataprep ingest endpoint.
# Arguments:
#   $1 - service host name or address
#   $2 - service port
#   $@ - any remaining arguments, forwarded to curl through _invoke_curl
# The result is left in the globals set by _invoke_curl.
function ingest_pdf() {
    local fqdn=$1
    local port=$2
    shift 2
    # Quote every expansion so extra parameters with spaces stay intact.
    _invoke_curl "$fqdn" "$port" ingest -F "files=@${SCRIPT_DIR}/ingest_dataprep.pdf" "$@"
}
63+
64+
# Upload ${SCRIPT_DIR}/ingest_dataprep.pptx to the dataprep ingest endpoint.
# Arguments:
#   $1 - service host name or address
#   $2 - service port
#   $@ - any remaining arguments, forwarded to curl through _invoke_curl
# The result is left in the globals set by _invoke_curl.
function ingest_pptx() {
    local fqdn=$1
    local port=$2
    shift 2
    # Quote every expansion so extra parameters with spaces stay intact.
    _invoke_curl "$fqdn" "$port" ingest -F "files=@${SCRIPT_DIR}/ingest_dataprep.pptx" "$@"
}
70+
71+
# Upload ${SCRIPT_DIR}/ingest_dataprep.txt to the dataprep ingest endpoint.
# Arguments:
#   $1 - service host name or address
#   $2 - service port
#   $@ - any remaining arguments, forwarded to curl through _invoke_curl
# The result is left in the globals set by _invoke_curl.
function ingest_txt() {
    local fqdn=$1
    local port=$2
    shift 2
    # Quote every expansion so extra parameters with spaces stay intact.
    _invoke_curl "$fqdn" "$port" ingest -F "files=@${SCRIPT_DIR}/ingest_dataprep.txt" "$@"
}
77+
78+
# Upload ${SCRIPT_DIR}/ingest_dataprep.xlsx to the dataprep ingest endpoint.
# Arguments:
#   $1 - service host name or address
#   $2 - service port
#   $@ - any remaining arguments, forwarded to curl through _invoke_curl
# The result is left in the globals set by _invoke_curl.
function ingest_xlsx() {
    local fqdn=$1
    local port=$2
    shift 2
    # Quote every expansion so extra parameters with spaces stay intact.
    _invoke_curl "$fqdn" "$port" ingest -F "files=@${SCRIPT_DIR}/ingest_dataprep.xlsx" "$@"
}
84+
85+
# Ask the dataprep ingest endpoint to ingest a fixed external link
# (https://www.ces.tech/) passed as the link_list form field.
# Arguments:
#   $1 - service host name or address
#   $2 - service port
#   $@ - any remaining arguments, forwarded to curl through _invoke_curl
# The result is left in the globals set by _invoke_curl.
function ingest_external_link() {
    local fqdn=$1
    local port=$2
    shift 2
    # Quote every expansion so extra parameters with spaces stay intact.
    _invoke_curl "$fqdn" "$port" ingest -F 'link_list=["https://www.ces.tech/"]' "$@"
}
91+
92+
# Call the dataprep delete endpoint with {"file_path":"all"}.
# Arguments:
#   $1 - service host name or address
#   $2 - service port
#   $@ - any remaining arguments, forwarded to curl through _invoke_curl
# The result is left in the globals set by _invoke_curl.
function delete_all() {
    local fqdn=$1
    local port=$2
    shift 2
    # Quote every expansion so extra parameters with spaces stay intact.
    _invoke_curl "$fqdn" "$port" delete -d '{"file_path":"all"}' "$@"
}
98+
99+
# Call the dataprep delete endpoint for the single file ingest_dataprep.txt.
# Arguments:
#   $1 - service host name or address
#   $2 - service port
#   $@ - any remaining arguments, forwarded to curl through _invoke_curl
# The result is left in the globals set by _invoke_curl.
function delete_single() {
    local fqdn=$1
    local port=$2
    # Only two positional parameters are consumed above. Shifting three fails
    # when exactly two are given, so fqdn and port are passed to curl again,
    # and silently drops the first extra curl argument otherwise.
    shift 2
    _invoke_curl "$fqdn" "$port" delete -d '{"file_path":"ingest_dataprep.txt"}' "$@"
}
105+
106+
# Call the dataprep get endpoint with no request body of its own.
# Arguments:
#   $1 - service host name or address
#   $2 - service port
#   $@ - any remaining arguments, forwarded to curl through _invoke_curl
# The result is left in the globals set by _invoke_curl.
function get_all() {
    local fqdn=$1
    local port=$2
    shift 2
    # Quote every expansion so extra parameters with spaces stay intact.
    _invoke_curl "$fqdn" "$port" get "$@"
}
112+
113+
# Validate the outcome of the most recent curl helper call.
# Globals read:
#   HTTP_STATUS   - status code captured by the curl helper
#   RESPONSE_BODY - response body captured by the curl helper
# Arguments:
#   $1 - label used in the log messages
#   $2 - literal text that must appear somewhere in RESPONSE_BODY
#   $3 - container whose `docker logs` output is appended to the log file on failure
#   $4 - log file to append to on failure
#   $5 - expected HTTP status (optional, defaults to 200)
# Exits the shell with status 1 when the status or the body check fails.
function check_result() {
    local service_name=$1
    local expected_response=$2
    local container_name=$3
    local logfile=$4
    local http_status="${5:-200}"

    # Compare as strings: a numeric test on an empty or malformed HTTP_STATUS
    # makes `[` error out, and that error would land in the "status OK" branch.
    if [[ "$HTTP_STATUS" != "$http_status" ]]; then
        echo "[ $service_name ] HTTP status is not ${http_status}. Received status was $HTTP_STATUS"
        docker logs "$container_name" >> "$logfile"
        exit 1
    else
        echo "[ $service_name ] HTTP status is ${http_status}. Checking content..."
    fi

    # check response body
    # The expected text is quoted so it is matched literally; unquoted, any
    # [, ? or * in it would be interpreted as a glob pattern.
    if [[ "$RESPONSE_BODY" != *"${expected_response}"* ]]; then
        echo "[ $service_name ] Content does not match the expected result: $RESPONSE_BODY"
        docker logs "$container_name" >> "$logfile"
        exit 1
    else
        echo "[ $service_name ] Content is as expected."
    fi
}

tests/dataprep/ingest_dataprep.doc

38.5 KB
Binary file not shown.

tests/dataprep/ingest_dataprep.docx

336 KB
Binary file not shown.

tests/dataprep/ingest_dataprep.pdf

2.29 MB
Binary file not shown.

tests/dataprep/ingest_dataprep.pptx

356 KB
Binary file not shown.

tests/dataprep/ingest_dataprep.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Like many companies in the O&G sector, the stock of Chevron (NYSE:CVX) has declined about 10% over the past 90-days despite the fact that Q2 consensus earnings estimates have risen sharply (~25%) during that same time frame. Over the years, Chevron has kept a very strong balance sheet. FirstEnergy (NYSE:FE – Get Rating) posted its earnings results on Tuesday. The utilities provider reported $0.53 earnings per share for the quarter, topping the consensus estimate of $0.52 by $0.01, RTT News reports. FirstEnergy had a net margin of 10.85% and a return on equity of 17.17%. The Dáil was almost suspended on Thursday afternoon after Sinn Féin TD John Brady walked across the chamber and placed an on-call pager in front of the Minister for Housing Darragh O’Brien during a debate on retained firefighters. Mr O’Brien said Mr Brady had taken part in an act of theatre that was obviously choreographed.Around 2,000 retained firefighters around the country staged a second day of industrial action on Tuesday and are due to start all out-strike action from next Tuesday. The mostly part-time workers, who keep the services going outside of Ireland’s larger urban centres, are taking industrial action in a dispute over pay and working conditions. Speaking in the Dáil, Sinn Féin deputy leader Pearse Doherty said firefighters had marched on Leinster House today and were very angry at the fact the Government will not intervene. Reintroduction of tax relief on mortgages needs to be considered, O’Brien says. Martin withdraws comment after saying People Before Profit would ‘put the jackboot on people’ Taoiseach ‘propagated fears’ farmers forced to rewet land due to nature restoration law – Cairns An intervention is required now. 
I’m asking you to make an improved offer in relation to pay for retained firefighters, Mr Doherty told the housing minister.I’m also asking you, and challenging you, to go outside after this Order of Business and meet with the firefighters because they are just fed up to the hilt in relation to what you said.Some of them have handed in their pagers to members of the Opposition and have challenged you to wear the pager for the next number of weeks, put up with an €8,600 retainer and not leave your community for the two and a half kilometres and see how you can stand over those type of pay and conditions. At this point, Mr Brady got up from his seat, walked across the chamber and placed the pager on the desk in front of Mr O’Brien. Ceann Comhairle Seán Ó Fearghaíl said the Sinn Féin TD was completely out of order and told him not to carry out a charade in this House, adding it was absolutely outrageous behaviour and not to be encouraged.Mr O’Brien said Mr Brady had engaged in an act of theatre here today which was obviously choreographed and was then interrupted with shouts from the Opposition benches. Mr Ó Fearghaíl said he would suspend the House if this racket continues.Mr O’Brien later said he said he was confident the dispute could be resolved and he had immense regard for firefighters. The minister said he would encourage the unions to re-engage with the State’s industrial relations process while also accusing Sinn Féin of using the issue for their own political gain.

tests/dataprep/ingest_dataprep.xlsx

8.84 KB
Binary file not shown.

tests/dataprep/test_dataprep_milvus.sh

Lines changed: 27 additions & 71 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,9 @@ DATAPREP_PORT=11101
1111
service_name="dataprep-milvus tei-embedding-serving etcd minio standalone"
1212
export TAG="comps"
1313

14+
SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
15+
source ${SCRIPT_DIR}/dataprep_utils.sh
16+
1417
function build_docker_images() {
1518
cd $WORKPATH
1619
echo $(pwd)
@@ -38,84 +41,37 @@ function start_service() {
3841
sleep 1m
3942
}
4043

41-
function validate_service() {
42-
local URL="$1"
43-
local EXPECTED_RESULT="$2"
44-
local SERVICE_NAME="$3"
45-
local DOCKER_NAME="$4"
46-
local INPUT_DATA="$5"
47-
48-
if [[ $SERVICE_NAME == *"dataprep_upload_file"* ]]; then
49-
cd $LOG_PATH
50-
HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -F 'files=@./dataprep_file.txt' -H 'Content-Type: multipart/form-data' "$URL")
51-
elif [[ $SERVICE_NAME == *"dataprep_upload_link"* ]]; then
52-
HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -F 'link_list=["https://www.ces.tech/"]' -F 'chunk_size=400' "$URL")
53-
elif [[ $SERVICE_NAME == *"dataprep_get"* ]]; then
54-
HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -H 'Content-Type: application/json' "$URL")
55-
elif [[ $SERVICE_NAME == *"dataprep_del"* ]]; then
56-
HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -d '{"file_path": "all"}' -H 'Content-Type: application/json' "$URL")
57-
else
58-
HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -d "$INPUT_DATA" -H 'Content-Type: application/json' "$URL")
59-
fi
60-
HTTP_STATUS=$(echo $HTTP_RESPONSE | tr -d '\n' | sed -e 's/.*HTTPSTATUS://')
61-
RESPONSE_BODY=$(echo $HTTP_RESPONSE | sed -e 's/HTTPSTATUS\:.*//g')
44+
function validate_microservice() {
45+
# test /v1/dataprep/ingest upload file
46+
ingest_doc ${ip_address} ${DATAPREP_PORT}
47+
check_result "dataprep - upload - doc" "Data preparation succeeded" dataprep-milvus-server ${LOG_PATH}/dataprep_milvus.log
6248

63-
docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log
49+
ingest_docx ${ip_address} ${DATAPREP_PORT}
50+
check_result "dataprep - upload - docx" "Data preparation succeeded" dataprep-milvus-server ${LOG_PATH}/dataprep_milvus.log
6451

65-
# check response status
66-
if [ "$HTTP_STATUS" -ne "200" ]; then
67-
echo "[ $SERVICE_NAME ] HTTP status is not 200. Received status was $HTTP_STATUS"
52+
ingest_pdf ${ip_address} ${DATAPREP_PORT}
53+
check_result "dataprep - upload - pdf" "Data preparation succeeded" dataprep-milvus-server ${LOG_PATH}/dataprep_milvus.log
6854

69-
if [[ $SERVICE_NAME == *"dataprep_upload_link"* ]]; then
70-
docker logs test-comps-dataprep-milvus-tei-server >> ${LOG_PATH}/tei-embedding.log
71-
fi
72-
exit 1
73-
else
74-
echo "[ $SERVICE_NAME ] HTTP status is 200. Checking content..."
75-
fi
76-
# check response body
77-
if [[ "$RESPONSE_BODY" != *"$EXPECTED_RESULT"* ]]; then
78-
echo "[ $SERVICE_NAME ] Content does not match the expected result: $RESPONSE_BODY"
79-
exit 1
80-
else
81-
echo "[ $SERVICE_NAME ] Content is as expected."
82-
fi
55+
ingest_pptx ${ip_address} ${DATAPREP_PORT}
56+
check_result "dataprep - upload - pptx" "Data preparation succeeded" dataprep-milvus-server ${LOG_PATH}/dataprep_milvus.log
8357

84-
sleep 5s
85-
}
58+
ingest_txt ${ip_address} ${DATAPREP_PORT}
59+
check_result "dataprep - upload - txt" "Data preparation succeeded" dataprep-milvus-server ${LOG_PATH}/dataprep_milvus.log
8660

87-
function validate_microservice() {
88-
cd $LOG_PATH
61+
ingest_xlsx ${ip_address} ${DATAPREP_PORT}
62+
check_result "dataprep - upload - xlsx" "Data preparation succeeded" dataprep-milvus-server ${LOG_PATH}/dataprep_milvus.log
8963

90-
# test /v1/dataprep/delete
91-
validate_service \
92-
"http://${ip_address}:${DATAPREP_PORT}/v1/dataprep/delete" \
93-
'{"status":true}' \
94-
"dataprep_del" \
95-
"dataprep-milvus-server"
96-
97-
# test /v1/dataprep upload file
98-
echo "Deep learning is a subset of machine learning that utilizes neural networks with multiple layers to analyze various levels of abstract data representations. It enables computers to identify patterns and make decisions with minimal human intervention by learning from large amounts of data." > $LOG_PATH/dataprep_file.txt
99-
validate_service \
100-
"http://${ip_address}:${DATAPREP_PORT}/v1/dataprep/ingest" \
101-
"Data preparation succeeded" \
102-
"dataprep_upload_file" \
103-
"dataprep-milvus-server"
104-
105-
# test /v1/dataprep upload link
106-
validate_service \
107-
"http://${ip_address}:${DATAPREP_PORT}/v1/dataprep/ingest" \
108-
"Data preparation succeeded" \
109-
"dataprep_upload_link" \
110-
"dataprep-milvus-server"
111-
112-
# test /v1/dataprep/get_file
113-
validate_service \
114-
"http://${ip_address}:${DATAPREP_PORT}/v1/dataprep/get" \
115-
'{"name":' \
116-
"dataprep_get" \
117-
"dataprep-milvus-server"
64+
# test /v1/dataprep/ingest upload link
65+
ingest_external_link ${ip_address} ${DATAPREP_PORT}
66+
check_result "dataprep - upload - link" "Data preparation succeeded" dataprep-milvus-server ${LOG_PATH}/dataprep_milvus.log
67+
68+
# test /v1/dataprep/get
69+
get_all ${ip_address} ${DATAPREP_PORT}
70+
check_result "dataprep - get" '{"name":' dataprep-milvus-server ${LOG_PATH}/dataprep_milvus.log
11871

72+
# test /v1/dataprep/delete
73+
delete_all ${ip_address} ${DATAPREP_PORT}
74+
check_result "dataprep - del" '{"status":true}' dataprep-milvus-server ${LOG_PATH}/dataprep_milvus.log
11975
}
12076

12177
function stop_docker() {

0 commit comments

Comments
 (0)