
Commit 0a07b93

zhuhaozhe and jianan-gu authored
Separate full-convergence test from training test (#581)
Co-authored-by: jianan-gu <[email protected]>
1 parent 0bded92 commit 0a07b93

4 files changed: +94 −12 lines

models/recommendation/pytorch/dlrm/product/dlrm_s_pytorch.py

Lines changed: 11 additions & 5 deletions
@@ -217,10 +217,11 @@ def create_emb(self, m, ln, local_ln_emb=None):
                 n = ln[i]
             else:
                 n = ln[local_ln_emb[i]]
-            EE = nn.EmbeddingBag(n, m, mode="sum", sparse=True)
-            # initialize embeddings
-            if not args.inference_only:
-                nn.init.uniform_(EE.weight, a=-np.sqrt(1 / n), b=np.sqrt(1 / n))
+            print("Create Embedding: {}".format(n), flush=True)
+            W = np.random.uniform(
+                low=-np.sqrt(1 / n), high=np.sqrt(1 / n), size=(n, m)
+            ).astype(np.float32)
+            EE = nn.EmbeddingBag(n, m, mode="sum", sparse=True, _weight=torch.tensor(W, requires_grad=True))
             emb_l.append(EE)
         return emb_l
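The hunk above replaces the in-place `nn.init.uniform_` initialization with a weight matrix drawn in NumPy and handed to `EmbeddingBag` through its `_weight` argument, so the initial values depend only on the NumPy RNG state. A minimal standalone sketch of the same pattern (table size, feature dimension, and seed are illustrative):

```python
import numpy as np
import torch
import torch.nn as nn

n, m = 1000, 128     # illustrative: rows in the embedding table, sparse feature size
np.random.seed(727)  # fixing the NumPy seed makes the initialization reproducible

# Draw initial weights uniformly in [-sqrt(1/n), sqrt(1/n)], as the diff does
W = np.random.uniform(low=-np.sqrt(1 / n), high=np.sqrt(1 / n), size=(n, m)).astype(np.float32)

# _weight lets EmbeddingBag start from a caller-supplied tensor instead of its default init
EE = nn.EmbeddingBag(n, m, mode="sum", sparse=True, _weight=torch.tensor(W, requires_grad=True))

# Sum-pool two bags of indices, as DLRM does for each sparse feature
out = EE(torch.tensor([1, 2, 4, 5]), offsets=torch.tensor([0, 2]))
print(out.shape)  # torch.Size([2, 128])
```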

@@ -248,9 +249,10 @@ def __init__(
             self.local_ln_emb = self.ln_emb[self.local_ln_emb_slice]
         else:
             self.local_ln_emb = None
-        self.emb_l = self.create_emb(m_spa, ln_emb, self.local_ln_emb)
+        self.l_emb_seeds = np.random.randint(low=0, high=100000, size=len(ln_emb))
         self.bot_l = self.create_mlp(ln_bot, sigmoid_bot)
         self.top_l = self.create_mlp(ln_top, sigmoid_top)
+        self.emb_l = self.create_emb(m_spa, ln_emb, self.local_ln_emb)
         self.loss_fn = torch.nn.BCELoss(reduction="mean")
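`l_emb_seeds` is only created in this hunk; presumably it gives each embedding table its own seed so that per-table initialization stays reproducible even though `create_emb` now runs after the MLPs are built. A hedged sketch of that idea (names and sizes below are illustrative, not taken from the diff):

```python
import numpy as np

ln_emb = np.array([1000, 5000, 200])  # illustrative table sizes
m = 128                               # sparse feature size

# One independent seed per table, as in the added line above
l_emb_seeds = np.random.randint(low=0, high=100000, size=len(ln_emb))

weights = []
for i, n in enumerate(ln_emb):
    rng = np.random.RandomState(l_emb_seeds[i])  # per-table generator
    W = rng.uniform(low=-np.sqrt(1 / n), high=np.sqrt(1 / n), size=(n, m)).astype(np.float32)
    weights.append(W)  # each W could seed an nn.EmbeddingBag via _weight, as in the first hunk
```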

@@ -588,6 +590,10 @@ def inference(
                 flush=True,
             )
             print("Accuracy: {:.34} ".format(validation_results["roc_auc"]))
+            if not args.inference_only:
+                if args.mlperf_auc_threshold != 0.0 and best_auc_test > args.mlperf_auc_threshold:
+                    print("Have reached the auc threshold:", args.mlperf_auc_threshold, ", stop training")
+                    exit()
         elif not args.inference_only:
             is_best = acc_test > best_acc_test
             if is_best:
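`validation_results["roc_auc"]` comes from evaluating the model on the test set (the reference DLRM code computes it with scikit-learn's `roc_auc_score`); the added block simply stops training once that AUC clears the MLPerf target. A small sketch of the check, with made-up scores:

```python
import numpy as np
from sklearn.metrics import roc_auc_score

# Illustrative targets and model scores collected over a test pass
targets = np.array([0, 1, 1, 0, 1, 0])
scores = np.array([0.10, 0.90, 0.72, 0.40, 0.81, 0.33])

roc_auc = roc_auc_score(targets, scores)
mlperf_auc_threshold = 0.8025  # value passed to --mlperf-auc-threshold in test_convergency.sh

# Same shape as the added check: a threshold of 0.0 disables early stopping
if mlperf_auc_threshold != 0.0 and roc_auc > mlperf_auc_threshold:
    print("Have reached the auc threshold:", mlperf_auc_threshold, ", stop training")
```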

quickstart/recommendation/pytorch/dlrm/training/cpu/README.md

Lines changed: 2 additions & 1 deletion
@@ -59,6 +59,7 @@ export OUTPUT_DIR=<directory where log files will be written>
 | Script name | Description |
 |-------------|-------------|
 | `training.sh` | Run training for the specified precision (fp32, avx-fp32, or bf16). |
+| `test_convergency.sh` | Run fully convergency test for the specified precision (fp32, avx-fp32, or bf16). |
 | `distribute_training.sh` | Run distribute training on 1 node with 2 sockets for the specified precision (fp32, avx-fp32, or bf16). |

 ## Run the model
@@ -91,7 +92,7 @@ NUM_BATCH=10000 bash training.sh
 NUM_BATCH=50000 bash training.sh

 # Or, run quickstart script for testing fully convergency
-bash training.sh
+bash test_convergence.sh

 # Run quickstart to distribute training dlrm on 2 sockets
 # Note, you need to follow [link](/docs/general/pytorch/BareMetalSetup.md) to install Torch-CCL and run this command on the machine which sockets larger than 2
Lines changed: 76 additions & 0 deletions (new file)
@@ -0,0 +1,76 @@
+# Copyright (c) 2021 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+MODEL_DIR=${MODEL_DIR-$PWD}
+if [ ! -e "${MODEL_DIR}/models/recommendation/pytorch/dlrm/product/dlrm_s_pytorch.py" ]; then
+  echo "Could not find the script of dlrm_s_pytorch.py. Please set environment variable '\${MODEL_DIR}'."
+  echo "From which the dlrm_s_pytorch.py exist at the: \${MODEL_DIR}/models/recommendation/pytorch/dlrm/product/dlrm_s_pytorch.py"
+  exit 1
+fi
+MODEL_SCRIPT=${MODEL_DIR}/models/recommendation/pytorch/dlrm/product/dlrm_s_pytorch.py
+
+echo "PRECISION: ${PRECISION}"
+echo "DATASET_DIR: ${DATASET_DIR}"
+echo "OUTPUT_DIR: ${OUTPUT_DIR}"
+
+if [ -z "${OUTPUT_DIR}" ]; then
+  echo "The required environment variable OUTPUT_DIR has not been set"
+  exit 1
+fi
+
+if [ -z "${DATASET_DIR}" ]; then
+  echo "The required environment variable DATASET_DIR has not been set"
+  exit 1
+fi
+
+if [ ! -d "${DATASET_DIR}" ]; then
+  echo "The DATASET_DIR '${DATASET_DIR}' does not exist"
+  exit 1
+fi
+
+
+# Create the output directory in case it doesn't already exist
+mkdir -p ${OUTPUT_DIR}
+LOG=${OUTPUT_DIR}/dlrm_training_log/${PRECISION}
+rm -rf ${LOG}
+mkdir -p ${LOG}
+
+if [[ "$PRECISION" == *"avx"* ]]; then
+  unset DNNL_MAX_CPU_ISA
+fi
+
+if [[ $PRECISION == "bf16" ]]; then
+  ARGS="$ARGS --bf16"
+  echo "running bf16 path"
+elif [[ $PRECISION == "fp32" || $PRECISION == "avx-fp32" ]]; then
+  echo "running fp32 path"
+else
+  echo "The specified PRECISION '${PRECISION}' is unsupported."
+  echo "Supported PRECISIONs are: fp32, avx-fp32, bf16"
+  exit 1
+fi
+
+LOG_0="${LOG}/socket_0"
+python -m intel_extension_for_pytorch.cpu.launch --node_id=0 --enable_tcmalloc $MODEL_SCRIPT \
+  --raw-data-file=${DATASET_DIR}/day --processed-data-file=${DATASET_DIR}/terabyte_processed.npz \
+  --data-set=terabyte \
+  --memory-map --mlperf-bin-loader --round-targets=True \
+  --arch-mlp-bot=13-512-256-128 --arch-mlp-top=1024-1024-512-256-1 \
+  --arch-sparse-feature-size=128 --max-ind-range=40000000 \
+  --numpy-rand-seed=727 --print-auc --mlperf-auc-threshold=0.8025 \
+  --mini-batch-size=32768 --print-freq=640 --print-time --ipex-interaction \
+  --test-mini-batch-size=262144 --ipex-merged-emb \
+  --lr-num-warmup-steps=8000 --lr-decay-start-step=70000 --lr-num-decay-steps=50000 \
+  --learning-rate=18.0 --test-freq=6400 --should-test | tee $LOG_0

quickstart/recommendation/pytorch/dlrm/training/cpu/training.sh

Lines changed: 5 additions & 6 deletions
@@ -41,14 +41,13 @@ if [ ! -d "${DATASET_DIR}" ]; then
 fi

 export TEST_FULLY_CONVERGENCE=0
-if [[ "$NUM_BATCH" != "" ]]
+if [ -z "${NUM_BATCH}" ]; then
 then
-    ARGS="$ARGS --num-batches=${NUM_BATCH}"
-    echo "will early stop after ${NUM_BATCH} batches"
+    echo "The required environment variable NUM_BATCH has not been set"
+    exit 1
 else
-    ARGS="$ARGS --lr-num-warmup-steps=8000 --lr-decay-start-step=70000 --lr-num-decay-steps=30000 --learning-rate=18.0 --should-test"
-    echo "not set early stop interaction, will fully test convergence"
-    TEST_FULLY_CONVERGENCE=1
+    ARGS="$ARGS --num-batches=${NUM_BATCH}"
+    echo "will train ${NUM_BATCH} batches"
 fi
