System information
OS Platform and Distribution (e.g., Linux Ubuntu 16.04): Debian
TensorFlow version and how it was installed (source or binary): 2.15.1
TensorFlow-Recommenders-Addons version and how it was installed (source or binary): source (master)
Python version: 3.10
Is GPU used? (yes/no): no
Describe the bug
I am using dynamic embeddings with a parameter server strategy. After the embedding weights/params are updated with optimizer.apply_gradients, the model variable DynamicEmbedding/my_embedding_layer-shadow:0 contains NaN values, reported by tf.debugging.check_numerics as "Tensor had NaN values".
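Concretely, it is the post-update variable check, placed immediately after optimizer.apply_gradients in the MWE below, that fires (excerpted here for reference):

for var in model.trainable_variables:
    tf.debugging.check_numerics(
        var, message=f"Post Update Variable check failed {var.name}"
    )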
Code to reproduce the issue
I have provided a self-contained MWE, which uses an in-process cluster to simulate PS training. The model is simple and the data is random.
import os
import multiprocessing
import portpicker
import json
# TFRA does some patching on TensorFlow so it MUST be imported after importing TF
import tensorflow as tf
import tensorflow_recommenders_addons.dynamic_embedding as de
BATCH_SIZE = 1
NUM_WORKERS = 2
NUM_PS = 2
LOG_EVERY_N = 2
def create_in_process_cluster():
    """Creates and starts local servers and sets tf_config in the environment."""
    worker_ports = [portpicker.pick_unused_port() for _ in range(NUM_WORKERS)]
    ps_ports = [portpicker.pick_unused_port() for _ in range(NUM_PS)]
    cluster_dict = {}
    cluster_dict["worker"] = ["localhost:%s" % port for port in worker_ports]
    if NUM_PS > 0:
        cluster_dict["ps"] = ["localhost:%s" % port for port in ps_ports]
    cluster_spec = tf.train.ClusterSpec(cluster_dict)
    worker_config = tf.compat.v1.ConfigProto()
    if multiprocessing.cpu_count() < NUM_WORKERS + 1:
        worker_config.inter_op_parallelism_threads = NUM_WORKERS + 1
        worker_config.intra_op_parallelism_threads = NUM_WORKERS + 1
    for i in range(NUM_WORKERS):
        tf.distribute.Server(
            cluster_spec,
            job_name="worker",
            task_index=i,
            config=worker_config,
            protocol="grpc",
        )
    ps_config = tf.compat.v1.ConfigProto()
    if multiprocessing.cpu_count() < NUM_PS + 1:
        ps_config.inter_op_parallelism_threads = NUM_PS + 1
        ps_config.intra_op_parallelism_threads = NUM_PS + 1
    for i in range(NUM_PS):
        tf.distribute.Server(
            cluster_spec, job_name="ps", task_index=i, protocol="grpc", config=ps_config
        )
    chief_port = portpicker.pick_unused_port()
    cluster_dict["chief"] = [f"localhost:{chief_port}"]
    tf_config = {"cluster": cluster_dict, "task": {"type": "chief", "index": 0}}
    os.environ["TF_CONFIG"] = json.dumps(tf_config)
    return tf_config

class TestModel(tf.keras.Model):
    def __init__(self):
        super(TestModel, self).__init__()
        self.gate = tf.keras.Sequential(
            [
                tf.keras.layers.Dense(
                    3,
                    use_bias=False,
                    activation="softmax",
                    name="gate",
                ),
                tf.keras.layers.Lambda(lambda x: tf.expand_dims(x, axis=-1)),
            ]
        )
        self.gate_mult = tf.keras.layers.Lambda(
            lambda x: tf.reduce_sum(x[0] * x[1], axis=1, keepdims=False)
        )
        self.emb = de.keras.layers.embedding.Embedding(
            name="my_embedding_layer",
            embedding_size=4,
            devices=[
                "/job:ps/replica:0/task:{}/device:CPU:0".format(idx)
                for idx in range(NUM_PS)
            ],
            distribute_strategy=tf.distribute.get_strategy(),
            with_unique=False,
            init_capacity=1,
        )
        self.dense = tf.keras.layers.Dense(1, activation="sigmoid")

    def call(self, x):
        embedding = self.emb(x)
        gate = self.gate(x)
        gate_mul = self.gate_mult([gate, embedding])
        output = self.dense(gate_mul)
        return output

    def compute_loss(self, inputs, training: bool = False) -> tf.Tensor:
        data, targets = inputs
        outputs = self(data)
        loss = tf.keras.losses.BinaryCrossentropy(
            from_logits=False, reduction=tf.keras.losses.Reduction.NONE
        )(
            tf.random.uniform((BATCH_SIZE, 1), minval=0, maxval=1, dtype=tf.int64),
            outputs,
        )
        return loss

def create_coordinator():
    resolver = tf.distribute.cluster_resolver.TFConfigClusterResolver()
    min_shard_bytes = 256 << 10
    max_shards = NUM_PS
    variable_partitioner = tf.distribute.experimental.partitioners.MinSizePartitioner(
        min_shard_bytes=min_shard_bytes, max_shards=max_shards
    )
    strategy = tf.distribute.ParameterServerStrategy(
        resolver, variable_partitioner=variable_partitioner
    )
    coordinator = tf.distribute.coordinator.ClusterCoordinator(strategy)
    return coordinator

def launch_training():
    # This runs on the chief, which is the process that launches the script.
    coordinator = create_coordinator()
    with coordinator.strategy.scope():
        model = TestModel()
        optimizer = tf.keras.optimizers.Adam(learning_rate=0.0001)
        optimizer = de.DynamicEmbeddingOptimizer(optimizer)
    strategy = coordinator.strategy
    steps_per_invocation = 2

    @tf.function
    def worker_train_step():
        all_losses = []
        for i in range(steps_per_invocation):

            def per_replica_step(data, targets):
                with tf.GradientTape() as tape:
                    per_example_loss = model.compute_loss(
                        (data, targets), training=True
                    )
                    for var in model.trainable_variables:
                        tf.debugging.check_numerics(
                            var, message=f"Pre Update Variable check failed {var.name}"
                        )
                    loss = tf.nn.compute_average_loss(per_example_loss)
                gradients = tape.gradient(
                    loss,
                    model.trainable_variables,
                )
                for grad in gradients:
                    tf.debugging.check_numerics(
                        grad, message="Gradient check failed"
                    )
                optimizer.apply_gradients(
                    zip(
                        gradients,
                        model.trainable_variables,
                    )
                )
                for var in model.trainable_variables:
                    tf.debugging.check_numerics(
                        var, message=f"Post Update Variable check failed {var.name}"
                    )
                for var in optimizer.variables():
                    if var.dtype in [tf.float16, tf.float32, tf.float64, tf.bfloat16]:
                        tf.debugging.check_numerics(
                            var, message="Optimizer variable check failed"
                        )
                return loss

            data, target = (
                tf.random.uniform(
                    (BATCH_SIZE, 1), minval=0, maxval=10000, dtype=tf.int64
                ),
                tf.random.uniform((BATCH_SIZE, 1), minval=0, maxval=1, dtype=tf.int64),
            )
            all_losses.append(strategy.run(per_replica_step, args=(data, target)))
        return strategy.reduce(tf.distribute.ReduceOp.MEAN, all_losses, axis=None)

    num_train_steps = 10000
    total_steps_to_schedule = max(num_train_steps // steps_per_invocation, 1)
    losses = []
    for i in range(1, total_steps_to_schedule + 1):
        losses.append(coordinator.schedule(worker_train_step))
        if i % LOG_EVERY_N == 0:
            coordinator.join()
            total_steps = steps_per_invocation * i
            avg_loss = tf.math.reduce_mean([loss.fetch() for loss in losses])
            print(
                f"avg loss {avg_loss} after {i} scheduled calls "
                f"({steps_per_invocation} steps per call, {total_steps} steps in total) "
                f"at batch size {BATCH_SIZE}"
            )
            losses = []
    coordinator.join()


if __name__ == "__main__":
    _ = create_in_process_cluster()
    launch_training()

Other info / logs
I'm not sure why, but I can only seem to reproduce this when LOG_EVERY_N > 1, which means coordinator.join() is not called after every scheduled step, so updates happen asynchronously.
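For contrast, here is a minimal sketch of the fully synchronized scheduling loop (equivalent to LOG_EVERY_N = 1, reusing coordinator, worker_train_step and total_steps_to_schedule from the MWE above); with this variant the post-update check does not seem to fire:

# Sketch only: join after every scheduled call so that no two
# worker_train_step invocations overlap on the parameter servers.
losses = []
for i in range(1, total_steps_to_schedule + 1):
    losses.append(coordinator.schedule(worker_train_step))
    coordinator.join()  # block until the scheduled invocation has finished
    avg_loss = tf.math.reduce_mean([loss.fetch() for loss in losses])
    print(f"avg loss {avg_loss} after {i} scheduled calls")
    losses = []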
Log output from tf.debugging.check_numerics:
(0) INVALID_ARGUMENT: Post Update Variable check failed DynamicEmbedding/my_embedding_layer-shadow:0 : Tensor had NaN values
[[{{node CheckNumerics_9}}]]