
Commit 0a07b93

zhuhaozhe and jianan-gu authored
Separate full-convergence test from training test (#581)
Co-authored-by: jianan-gu <[email protected]>
1 parent 0bded92 commit 0a07b93

4 files changed: +94 −12 lines

models/recommendation/pytorch/dlrm/product/dlrm_s_pytorch.py

Lines changed: 11 additions & 5 deletions
@@ -217,10 +217,11 @@ def create_emb(self, m, ln, local_ln_emb=None):
                 n = ln[i]
             else:
                 n = ln[local_ln_emb[i]]
-            EE = nn.EmbeddingBag(n, m, mode="sum", sparse=True)
-            # initialize embeddings
-            if not args.inference_only:
-                nn.init.uniform_(EE.weight, a=-np.sqrt(1 / n), b=np.sqrt(1 / n))
+            print("Create Embedding: {}".format(n), flush=True)
+            W = np.random.uniform(
+                low=-np.sqrt(1 / n), high=np.sqrt(1 / n), size=(n, m)
+            ).astype(np.float32)
+            EE = nn.EmbeddingBag(n, m, mode="sum", sparse=True, _weight=torch.tensor(W, requires_grad=True))
             emb_l.append(EE)
         return emb_l
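The hunk above replaces the in-place `nn.init.uniform_` initialization with a weight matrix drawn in NumPy and handed to `EmbeddingBag` through its `_weight` argument, so the initial values depend only on the NumPy RNG state. A minimal standalone sketch of the same pattern (table size, feature dimension, and seed are illustrative):

```python
import numpy as np
import torch
import torch.nn as nn

n, m = 1000, 128     # illustrative: rows in the embedding table, sparse feature size
np.random.seed(727)  # fixing the NumPy seed makes the initialization reproducible

# Draw initial weights uniformly in [-sqrt(1/n), sqrt(1/n)], as the diff does
W = np.random.uniform(low=-np.sqrt(1 / n), high=np.sqrt(1 / n), size=(n, m)).astype(np.float32)

# _weight lets EmbeddingBag start from a caller-supplied tensor instead of its default init
EE = nn.EmbeddingBag(n, m, mode="sum", sparse=True, _weight=torch.tensor(W, requires_grad=True))

# Sum-pool two bags of indices, as DLRM does for each sparse feature
out = EE(torch.tensor([1, 2, 4, 5]), offsets=torch.tensor([0, 2]))
print(out.shape)  # torch.Size([2, 128])
```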

@@ -248,9 +249,10 @@ def __init__(
             self.local_ln_emb = self.ln_emb[self.local_ln_emb_slice]
         else:
             self.local_ln_emb = None
-        self.emb_l = self.create_emb(m_spa, ln_emb, self.local_ln_emb)
+        self.l_emb_seeds = np.random.randint(low=0, high=100000, size=len(ln_emb))
         self.bot_l = self.create_mlp(ln_bot, sigmoid_bot)
         self.top_l = self.create_mlp(ln_top, sigmoid_top)
+        self.emb_l = self.create_emb(m_spa, ln_emb, self.local_ln_emb)
         self.loss_fn = torch.nn.BCELoss(reduction="mean")
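`l_emb_seeds` is only created in this hunk; presumably it gives each embedding table its own seed so that per-table initialization stays reproducible even though `create_emb` now runs after the MLPs are built. A hedged sketch of that idea (names and sizes below are illustrative, not taken from the diff):

```python
import numpy as np

ln_emb = np.array([1000, 5000, 200])  # illustrative table sizes
m = 128                               # sparse feature size

# One independent seed per table, as in the added line above
l_emb_seeds = np.random.randint(low=0, high=100000, size=len(ln_emb))

weights = []
for i, n in enumerate(ln_emb):
    rng = np.random.RandomState(l_emb_seeds[i])  # per-table generator
    W = rng.uniform(low=-np.sqrt(1 / n), high=np.sqrt(1 / n), size=(n, m)).astype(np.float32)
    weights.append(W)  # each W could seed an nn.EmbeddingBag via _weight, as in the first hunk
```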

@@ -588,6 +590,10 @@ def inference(
                 flush=True,
             )
             print("Accuracy: {:.34} ".format(validation_results["roc_auc"]))
+            if not args.inference_only:
+                if args.mlperf_auc_threshold != 0.0 and best_auc_test > args.mlperf_auc_threshold:
+                    print("Have reached the auc threshold:", args.mlperf_auc_threshold, ", stop training")
+                    exit()
         elif not args.inference_only:
             is_best = acc_test > best_acc_test
             if is_best:
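`validation_results["roc_auc"]` comes from evaluating the model on the test set (the reference DLRM code computes it with scikit-learn's `roc_auc_score`); the added block simply stops training once that AUC clears the MLPerf target. A small sketch of the check, with made-up scores:

```python
import numpy as np
from sklearn.metrics import roc_auc_score

# Illustrative targets and model scores collected over a test pass
targets = np.array([0, 1, 1, 0, 1, 0])
scores = np.array([0.10, 0.90, 0.72, 0.40, 0.81, 0.33])

roc_auc = roc_auc_score(targets, scores)
mlperf_auc_threshold = 0.8025  # value passed to --mlperf-auc-threshold in test_convergency.sh

# Same shape as the added check: a threshold of 0.0 disables early stopping
if mlperf_auc_threshold != 0.0 and roc_auc > mlperf_auc_threshold:
    print("Have reached the auc threshold:", mlperf_auc_threshold, ", stop training")
```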

quickstart/recommendation/pytorch/dlrm/training/cpu/README.md

Lines changed: 2 additions & 1 deletion
@@ -59,6 +59,7 @@ export OUTPUT_DIR=<directory where log files will be written>
 | Script name | Description |
 |-------------|-------------|
 | `training.sh` | Run training for the specified precision (fp32, avx-fp32, or bf16). |
+| `test_convergency.sh` | Run fully convergency test for the specified precision (fp32, avx-fp32, or bf16). |
 | `distribute_training.sh` | Run distribute training on 1 node with 2 sockets for the specified precision (fp32, avx-fp32, or bf16). |

 ## Run the model
@@ -91,7 +92,7 @@ NUM_BATCH=10000 bash training.sh
 NUM_BATCH=50000 bash training.sh

 # Or, run quickstart script for testing fully convergency
-bash training.sh
+bash test_convergence.sh

 # Run quickstart to distribute training dlrm on 2 sockets
 # Note, you need to follow [link](/docs/general/pytorch/BareMetalSetup.md) to install Torch-CCL and run this command on the machine which sockets larger than 2
Lines changed: 76 additions & 0 deletions (new file)
@@ -0,0 +1,76 @@
+# Copyright (c) 2021 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+MODEL_DIR=${MODEL_DIR-$PWD}
+if [ ! -e "${MODEL_DIR}/models/recommendation/pytorch/dlrm/product/dlrm_s_pytorch.py" ]; then
+  echo "Could not find the script of dlrm_s_pytorch.py. Please set environment variable '\${MODEL_DIR}'."
+  echo "From which the dlrm_s_pytorch.py exist at the: \${MODEL_DIR}/models/recommendation/pytorch/dlrm/product/dlrm_s_pytorch.py"
+  exit 1
+fi
+MODEL_SCRIPT=${MODEL_DIR}/models/recommendation/pytorch/dlrm/product/dlrm_s_pytorch.py
+
+echo "PRECISION: ${PRECISION}"
+echo "DATASET_DIR: ${DATASET_DIR}"
+echo "OUTPUT_DIR: ${OUTPUT_DIR}"
+
+if [ -z "${OUTPUT_DIR}" ]; then
+  echo "The required environment variable OUTPUT_DIR has not been set"
+  exit 1
+fi
+
+if [ -z "${DATASET_DIR}" ]; then
+  echo "The required environment variable DATASET_DIR has not been set"
+  exit 1
+fi
+
+if [ ! -d "${DATASET_DIR}" ]; then
+  echo "The DATASET_DIR '${DATASET_DIR}' does not exist"
+  exit 1
+fi
+
+
+# Create the output directory in case it doesn't already exist
+mkdir -p ${OUTPUT_DIR}
+LOG=${OUTPUT_DIR}/dlrm_training_log/${PRECISION}
+rm -rf ${LOG}
+mkdir -p ${LOG}
+
+if [[ "$PRECISION" == *"avx"* ]]; then
+  unset DNNL_MAX_CPU_ISA
+fi
+
+if [[ $PRECISION == "bf16" ]]; then
+  ARGS="$ARGS --bf16"
+  echo "running bf16 path"
+elif [[ $PRECISION == "fp32" || $PRECISION == "avx-fp32" ]]; then
+  echo "running fp32 path"
+else
+  echo "The specified PRECISION '${PRECISION}' is unsupported."
+  echo "Supported PRECISIONs are: fp32, avx-fp32, bf16"
+  exit 1
+fi
+
+LOG_0="${LOG}/socket_0"
+python -m intel_extension_for_pytorch.cpu.launch --node_id=0 --enable_tcmalloc $MODEL_SCRIPT \
+  --raw-data-file=${DATASET_DIR}/day --processed-data-file=${DATASET_DIR}/terabyte_processed.npz \
+  --data-set=terabyte \
+  --memory-map --mlperf-bin-loader --round-targets=True \
+  --arch-mlp-bot=13-512-256-128 --arch-mlp-top=1024-1024-512-256-1 \
+  --arch-sparse-feature-size=128 --max-ind-range=40000000 \
+  --numpy-rand-seed=727 --print-auc --mlperf-auc-threshold=0.8025 \
+  --mini-batch-size=32768 --print-freq=640 --print-time --ipex-interaction \
+  --test-mini-batch-size=262144 --ipex-merged-emb \
+  --lr-num-warmup-steps=8000 --lr-decay-start-step=70000 --lr-num-decay-steps=50000 \
+  --learning-rate=18.0 --test-freq=6400 --should-test | tee $LOG_0

quickstart/recommendation/pytorch/dlrm/training/cpu/training.sh

Lines changed: 5 additions & 6 deletions
@@ -41,14 +41,13 @@ if [ ! -d "${DATASET_DIR}" ]; then
 fi

 export TEST_FULLY_CONVERGENCE=0
-if [[ "$NUM_BATCH" != "" ]]
+if [ -z "${NUM_BATCH}" ]; then
 then
-    ARGS="$ARGS --num-batches=${NUM_BATCH}"
-    echo "will early stop after ${NUM_BATCH} batches"
+    echo "The required environment variable NUM_BATCH has not been set"
+    exit 1
 else
-    ARGS="$ARGS --lr-num-warmup-steps=8000 --lr-decay-start-step=70000 --lr-num-decay-steps=30000 --learning-rate=18.0 --should-test"
-    echo "not set early stop interaction, will fully test convergence"
-    TEST_FULLY_CONVERGENCE=1
+    ARGS="$ARGS --num-batches=${NUM_BATCH}"
+    echo "will train ${NUM_BATCH} batches"
 fi
