Skip to content

Commit 5ef5fb4

Browse files
Add the DNN model & Wide-and-Deep model using the census dataset for the SQLFlow syntax discussion. (#1697)
* Add the DNN Model & WideAndDeep Model for census dataset for SQLFlow syntax discussion * Add the customized layer for GROUP * Remove unnecessary NUMERIC keyword * Remove unused imports * Resolve pre-commit issue * Resolve pre-commit error from isort * Auto update the code using pre-commit * Use Embedding(workclass, 16) instead of Embedding(Hash(workclass, 64), 16) * Add the metadata parsed from COLUMN clause in SQLFlow statement * Add the missed imports in wide_deep_functional.py * Fix the pre-commit check issue * Resolve pre-commit check issue * Resolve issues from pre-commit check * Make the imports in order * Resolve pre-commit check issue * Rename APPLY_VOCAB to VOCABULARIZE
1 parent 41ae333 commit 5ef5fb4

11 files changed

+874
-0
lines changed

model_zoo/census_model_sqlflow/__init__.py

Whitespace-only changes.

model_zoo/census_model_sqlflow/dnn/__init__.py

Whitespace-only changes.
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
-- Train a DNNClassifier on the census income table.
-- Fix: the original wrote EMBEDDING(martial_status, 16) — "martial" is a
-- typo of "marital" (the companion feature-column file uses marital-status).
SELECT *
FROM census_income
TO TRAIN DNNClassifier
WITH model.hidden_units = [10, 20]
COLUMN (
    age,
    capital_gain,
    capital_loss,
    hours_per_week,
    EMBEDDING(workclass, 16),
    EMBEDDING(education, 16),
    EMBEDDING(marital_status, 16),
    EMBEDDING(occupation, 16),
    EMBEDDING(relationship, 16),
    EMBEDDING(race, 16),
    EMBEDDING(sex, 16),
    EMBEDDING(native_country, 16)
)
LABEL label
Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
import tensorflow as tf

# Categorical (string-valued) columns of the census income dataset.
# Each one is hashed and embedded in get_feature_columns().
CATEGORICAL_FEATURE_KEYS = [
    "workclass",
    "education",
    "marital-status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "native-country",
]
# Continuous columns, fed through plain numeric feature columns.
NUMERIC_FEATURE_KEYS = [
    "age",
    "capital-gain",
    "capital-loss",
    "hours-per-week",
]
# Name of the binary target column.
LABEL_KEY = "label"
20+
21+
22+
def get_feature_columns():
    """Build the list of tf.feature_column objects for the census model.

    Numeric features pass through unchanged; every categorical feature is
    hashed into 64 buckets and embedded into 16 dimensions. Numeric columns
    come first, matching the key order of the two module-level lists.
    """
    numeric_columns = [
        tf.feature_column.numeric_column(key) for key in NUMERIC_FEATURE_KEYS
    ]
    embedding_columns = [
        tf.feature_column.embedding_column(
            tf.feature_column.categorical_column_with_hash_bucket(
                key, hash_bucket_size=64
            ),
            dimension=16,
        )
        for key in CATEGORICAL_FEATURE_KEYS
    ]
    return numeric_columns + embedding_columns
39+
40+
41+
def get_feature_input_layers():
    """Create a name -> tf.keras.Input mapping for every model feature.

    Numeric features are declared as float32 inputs and categorical
    features as string inputs; each has per-example shape (1,).
    """
    inputs = {
        key: tf.keras.Input(shape=(1,), name=key, dtype=tf.float32)
        for key in NUMERIC_FEATURE_KEYS
    }
    inputs.update(
        {
            key: tf.keras.Input(shape=(1,), name=key, dtype=tf.string)
            for key in CATEGORICAL_FEATURE_KEYS
        }
    )
    return inputs
Lines changed: 101 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,101 @@
1+
import tensorflow as tf
2+
from tensorflow.python.keras.metrics import accuracy
3+
4+
from model_zoo.census_model_sqlflow.dnn.census_feature_column import (
5+
get_feature_columns,
6+
get_feature_input_layers,
7+
)
8+
9+
10+
# Model definition from the model zoo.
# Args:
#   feature_columns: list of tf.feature_column objects; can be generated
#     from the SQLFlow `COLUMN` clause.
#   feature_input_layers: dict mapping feature name -> tf.keras.Input.
def dnn_classifier(feature_columns, feature_input_layers):
    """Build a two-hidden-layer MLP with a sigmoid binary-classification head."""
    dense_features = tf.keras.layers.DenseFeatures(feature_columns)
    hidden = dense_features(feature_input_layers)
    # Two identical 16-unit ReLU layers, as in the original architecture.
    for _ in range(2):
        hidden = tf.keras.layers.Dense(16, activation="relu")(hidden)
    output = tf.keras.layers.Dense(1, activation="sigmoid")(hidden)

    return tf.keras.Model(inputs=feature_input_layers, outputs=output)
25+
26+
27+
# Entry point of the submitter program.
def custom_model():
    """Assemble the DNN classifier from the census feature definitions."""
    return dnn_classifier(
        feature_columns=get_feature_columns(),
        feature_input_layers=get_feature_input_layers(),
    )
36+
37+
38+
def loss(labels, predictions):
    """Binary cross-entropy; labels get a trailing axis to match predictions."""
    expanded_labels = tf.expand_dims(labels, axis=1)
    return tf.keras.losses.binary_crossentropy(expanded_labels, predictions)
41+
42+
43+
def optimizer():
    """Return a fresh Adam optimizer with default hyperparameters."""
    return tf.keras.optimizers.Adam()
45+
46+
47+
def eval_metrics_fn():
    """Return the metric-name -> callable mapping used for evaluation."""

    def _accuracy(labels, predictions):
        # Round sigmoid outputs to {0, 1} before comparing with int labels.
        rounded = tf.cast(tf.squeeze(tf.round(predictions)), tf.int32)
        return accuracy(rounded, tf.cast(labels, tf.int32))

    return {"accuracy": _accuracy}
54+
55+
56+
# Categorical (string-valued) columns of the census income dataset,
# parsed as tf.string features in dataset_fn below.
CATEGORICAL_FEATURE_KEYS = [
    "workclass",
    "education",
    "marital-status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "native-country",
]
# Continuous columns, parsed as tf.float32 features in dataset_fn below.
NUMERIC_FEATURE_KEYS = [
    "age",
    "capital-gain",
    "capital-loss",
    "hours-per-week",
]
# Name of the binary target column (tf.int64 in the serialized records).
LABEL_KEY = "label"
73+
74+
75+
# TODO: dataset_fn and the column names above are tied to the input data
# source; consider moving them out of the model definition file. ElasticDL
# currently requires dataset_fn to live in the same file as the model.
def dataset_fn(dataset, mode, _):
    """Map serialized tf.Example records to (features, label) pairs."""

    def _decode(serialized):
        # Schema: string for categorical, float32 for numeric, int64 label.
        schema = {
            name: tf.io.FixedLenFeature([], tf.string)
            for name in CATEGORICAL_FEATURE_KEYS
        }
        schema.update(
            {
                name: tf.io.FixedLenFeature([], tf.float32)
                for name in NUMERIC_FEATURE_KEYS
            }
        )
        schema[LABEL_KEY] = tf.io.FixedLenFeature([], tf.int64)

        features = tf.io.parse_single_example(serialized, schema)
        label = features.pop(LABEL_KEY)
        return features, label

    return dataset.map(_decode)

model_zoo/census_model_sqlflow/wide_and_deep/__init__.py

Whitespace-only changes.
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
-- Train a WideAndDeepClassifier on the census income table.
-- Fixes vs. the original statement:
--   * BUCKTIZE(hours_per_week, ...) -> BUCKETIZE (typo; the other two
--     bucketize calls in the same clause spell it BUCKETIZE).
--   * The embedding lists referenced group1/group2/group3, but the groups
--     are declared AS group_1/group_2/group_3 — names now match.
--   * Missing comma between the wide_embeddings and deep_embeddings clauses.
--   * martial_status -> marital_status (typo).
SELECT *
FROM census_income
TO TRAIN WideAndDeepClassifier
COLUMN (
    SET GROUP(VOCABULARIZE(workclass), BUCKETIZE(capital_gain, bucket_num=5), BUCKETIZE(capital_loss, bucket_num=5), BUCKETIZE(hours_per_week, bucket_num=6)) AS group_1,
    SET GROUP(HASH(education), HASH(occupation), VOCABULARIZE(marital_status), VOCABULARIZE(relationship)) AS group_2,
    SET GROUP(BUCKETIZE(age, bucket_num=5), HASH(native_country), VOCABULARIZE(race), VOCABULARIZE(sex)) AS group_3,

    [EMBEDDING(group_1, 1), EMBEDDING(group_2, 1)] AS wide_embeddings,
    [EMBEDDING(group_1, 8), EMBEDDING(group_2, 8), EMBEDDING(group_3, 8)] AS deep_embeddings
)
LABEL label

0 commit comments

Comments
 (0)