diff --git a/mesh_tensorflow/beam_search.py b/mesh_tensorflow/beam_search.py
index e7173055..6d9f0421 100644
--- a/mesh_tensorflow/beam_search.py
+++ b/mesh_tensorflow/beam_search.py
@@ -121,7 +121,7 @@ def beam_search(logits_fn,
   num_prefilter is a theoretically lossy shortcut around slow performance of
   top_k on TPU on large Tensors and large k. This option should be removed once
-  better top_k implementations on TPU are avialable. If num_prefilter is set to
+  better top_k implementations on TPU are available. If num_prefilter is set to
   a nonzero value, then at each step we first compute the top num_prefilter
   sequences per beam and then compute the top k sequences overall from among
   those. Empirically, there seems to be no quality difference in setting
@@ -321,7 +321,7 @@ def grow_topk(i, alive_seq, alive_log_probs, states=None):
     top_scores = mtf.reshape(top_scores, [batch_dim, combined])
     top_minor_vocab_ids = mtf.reshape(
         top_minor_vocab_ids, [batch_dim, combined])
-    # shpae = [batch_dim, double_beam]
+    # shape = [batch_dim, double_beam]
     # ids are indices representing (beam, major_vocab, prefilter)
     top_scores, top_combined_ids = mtf.top_k(
         top_scores, reduced_dim=combined, k_dim=double_beam)
@@ -585,7 +585,7 @@ def greedy_decode(logits_fn,
   Args:
     logits_fn: Interface to the model, to provide logits.
-      Shoud take:
+      Should take:
        step_num - mtf Scalar
        ids - mtf Tensor with shape [..., length]
        states - list of mtf.Tensor
diff --git a/mesh_tensorflow/experimental/input_reader.py b/mesh_tensorflow/experimental/input_reader.py
index 815750b5..8559e9d1 100644
--- a/mesh_tensorflow/experimental/input_reader.py
+++ b/mesh_tensorflow/experimental/input_reader.py
@@ -277,7 +277,7 @@ def __init__(self,
        be used once and only once.
     Note:
       1. The efficiency is optimized according to the shape of the 0-th tensor:
-         mtf_input_shapes[0]. We recommand you to put the largest tensor as the
+         mtf_input_shapes[0]. We recommend you to put the largest tensor as the
          0-th input.
       2. You need to call start_infeed_thread() before your train ops.
     Example:
diff --git a/mesh_tensorflow/layers.py b/mesh_tensorflow/layers.py
index 5705ecfc..1bd1b7c6 100644
--- a/mesh_tensorflow/layers.py
+++ b/mesh_tensorflow/layers.py
@@ -37,7 +37,7 @@ def unit_scaling_convention(value=False):
   variance, and the outputs of most contractions (matmul/einsum operations)
   are divided by the square-root of the sizes of the contracting dimensions.

-  This differs from the typical inverse-square-root weight-initalization
+  This differs from the typical inverse-square-root weight-initialization
   convention often attributed to http://proceedings.mlr.press/v9/glorot10a.html
   in which weights are typically initialized according to a distribution with
@@ -402,7 +402,7 @@ def _depthwise_conv1d_hack(x,
     max_relative_pos: int, max relative position,
     name: str, variable_scope name,
     use_bias: Bool, whether to use bias,
-    initializer_scale: int, initalizer scale,
+    initializer_scale: int, initializer scale,
     kernel_depth_weights: an optional list of kernel weight tensors. The list
       contains one element for each relative position in the kernel. Each element
       has a width equal to the depth over which the separable conv operation is
@@ -550,7 +550,7 @@ def conv2d_with_blocks(
     padding: string, "SAME". The type of padding algorithm to use.
      "Valid" is not currently supported.
    h_blocks_dim: Dimension representing number of height blocks.
-    w_blocks_dim: Dimension representing number of witdh blocks.
+    w_blocks_dim: Dimension representing number of width blocks.
    filter_initializer: the initializer for tf.get_variable.
    variable_dtype: a mtf.VariableDType
    name: a name for the operation (optional).
diff --git a/mesh_tensorflow/ops.py b/mesh_tensorflow/ops.py
index 32c0897e..e7a976fd 100644
--- a/mesh_tensorflow/ops.py
+++ b/mesh_tensorflow/ops.py
@@ -448,11 +448,11 @@ def rewrite_stack_variables(self,
     and large numbers of variables.

     This function should be called after graph construction (it is called by
-    default in the Lowering constuctor).
+    default in the Lowering constructor).

     When we find a set of variables with the same shape/dtype/etc, we replace
     them with one StackedVariable and an "unstack" operation. The
-    StackedVariable has multiple master variables (so as to maintain checkpiont
+    StackedVariable has multiple master variables (so as to maintain checkpoint
     compatibility), but only one slice variable per device. We point the
     inputs of later operations to the outputs of the "unstack" operations,
     instead of the outputs of the defunct single variables.
@@ -477,7 +477,7 @@ def rewrite_stack_variables(self,
     self._operations = []
     self._all_variables = []
     self._trainable_variables = []
-    # We can only stack varaibles which share the same set of assignment
+    # We can only stack variables which share the same set of assignment
     # operations.
     var_to_assign_ops = collections.defaultdict(str)
     for op in operations:
@@ -608,7 +608,7 @@ def make_variables_untrainable(self, variables):
   def clone_operations(self, ops, input_mapping):
     """Clone a portion of the graph, but with different inputs.

-    The differnt inputs are specified by the `input_mapping` dictionary, which
+    The different inputs are specified by the `input_mapping` dictionary, which
     maps from input Tensor in the original operations to input Tensor in the
     cloned operations. If an original operation uses an external input that is
     not in `input_mapping`, then the original input is used for the cloned
@@ -1033,7 +1033,7 @@ def slicewise_delay_allreduce(self, fn, *inputs):
     Args:
       fn: function from tf.Tensors to tf.Tensor or a tuple of tf.Tensors.
      *inputs: list of inputs. Each input is either a LaidOutTensor or
-        has a to_laid_out_tensor method or is convertibleto a tf.Tensor.
+        has a to_laid_out_tensor method or is convertible to a tf.Tensor.

     Returns:
       LaidOutTensor or LazyAllreduceSum
@@ -2057,7 +2057,7 @@ def __init__(self, tf_fn, x1, x2, output_shape, output_dtype, name=None):
         self._initialize_all_dimensions_as_splittable())

   def gradient(self, unused_grad_ys):
-    raise ValueError("Gradient not implememnted")
+    raise ValueError("Gradient not implemented")

   def lower(self, lowering):
     x1 = self.inputs[0]
diff --git a/mesh_tensorflow/transformer/attention.py b/mesh_tensorflow/transformer/attention.py
index 115ca3ab..d063db2d 100644
--- a/mesh_tensorflow/transformer/attention.py
+++ b/mesh_tensorflow/transformer/attention.py
@@ -730,7 +730,7 @@ def local_attention_1d(q,
                        write_priority=None,
                        read_priority=None,
                        attention_kwargs=None):
-  """Attention to the a neighborood around the source.
+  """Attention to a neighborhood around the source.

   If fully_autoregressive, then query position p can only see memory positions
   in the range (p - radius, p].
@@ -846,7 +846,7 @@ def _maybe_reshape_attention_input_for_2d_sharding(
   this function reshapes the attention inputs to remove the unnecessary
   replication.

-  This becomes relevent when doing 2-dimenional model parallelism.
+  This becomes relevant when doing 2-dimensional model parallelism.
   d_model is sharded over one mesh dimension and [vocab, num_heads, d_ff] are
   sharded over the other mesh dimension. This fully distributes all of the
   einsum operations, except for the internals of the attention computation.
diff --git a/mesh_tensorflow/transformer/dataset.py b/mesh_tensorflow/transformer/dataset.py
index 2ea75879..bcee5a93 100644
--- a/mesh_tensorflow/transformer/dataset.py
+++ b/mesh_tensorflow/transformer/dataset.py
@@ -724,7 +724,7 @@ def trim_and_pad_dataset(dataset, length, feature_keys=None):
   """Trim and pad first dimension of features to size `length`.

   Args:
-    dataset: tf.data.Dataset, the dataset to trimp/pad examples in.
+    dataset: tf.data.Dataset, the dataset to trim/pad examples in.
     length: int, or a dict from feature-key to int
     feature_keys: (optional) collection of strings, the feature names to limit
       trimming/padding to. Defaults to all features.
diff --git a/mesh_tensorflow/transformer/transformer.py b/mesh_tensorflow/transformer/transformer.py
index aa63046d..03c4ba87 100644
--- a/mesh_tensorflow/transformer/transformer.py
+++ b/mesh_tensorflow/transformer/transformer.py
@@ -15,7 +15,7 @@
 """MTF implementation of Transformer sequence/seq2seq model.

-This implmentation is meant to be extensible, allowing users to define their
+This implementation is meant to be extensible, allowing users to define their
 own custom layers. It is meant to eventually replace the existing
 mesh-tensorflow Transformer implementation in the Tensor2Tensor library.
@@ -357,7 +357,7 @@ def __init__(self,
     functions.

     `dropout_rate` and `norm_epsilon` should only be specified in a legacy mode,
-    for compatiblity with older checkpoints.
+    for compatibility with older checkpoints.

     Args:
       layers: a list of TransformerLayer
@@ -471,7 +471,7 @@ def sublayer_mask_padding(x, layer_stack, context):
   This "fixes" a bug where extreme values leak from the padding into the
   non-padding regions.

-  TODO(noam): undertand this better and make a more principled fix.
+  TODO(noam): understand this better and make a more principled fix.

   Args:
     x: a Tensor
@@ -694,9 +694,9 @@ def __init__(self,
        loss for the inputs portion of the example.
      loss_denominator: an optional float. The default behavior is to
        compute the mean loss across all tokens in the batch, making the
-        denomiator the size of the targets tensor (omitting ensemble
+        denominator the size of the targets tensor (omitting ensemble
        dimensions).
-        Passing a float here provides an alternative denomiator.
+        Passing a float here provides an alternative denominator.
        One use case is that when fine-tuning a model using a much smaller
        batch size than the original training batch, one might want to use the
        same denominator as was used for the pretraining. This complication
@@ -834,7 +834,7 @@ def _call_internal(self, context, inputs, targets=None):
    if (context.length_dim is not None and
        context.length_dim.size > self.max_length_dim.size):
      message = (
-          "Length dimenison exceeds size of positional embedding table. "
+          "Length dimension exceeds size of positional embedding table. "
          "length_dim.size > max_length_dim.size %s vs %s."
          % (context.length_dim, self.max_length_dim))
      if context.position_is_default:
@@ -878,7 +878,7 @@ def loss_denominator(self, targets, num_microbatches):
    """Denominator applied to losses.

    This is usually the size of the targets tensor (omitting ensemble
-    dimensions). Alternitively, it is an override value passed to the
+    dimensions). Alternatively, it is an override value passed to the
    class constructor.
    Args:
@@ -1457,7 +1457,7 @@ def call_simple(self,
       logits: a Tensor with shape [, output_vocab_dim]
       loss: an optional Scalar (if compute_loss=True)
     """
-    # encoder_sequene_id and decoder_sequence_id are used to delineate packed
+    # encoder_sequence_id and decoder_sequence_id are used to delineate packed
     # examples but are also necessary to indicate padding where sequence_id==0.
     # If they are absent, then we assume that padding is indicated by zeros in
     # the inputs/targets, and we make up sequence_id tensors to indicate this.
diff --git a/mesh_tensorflow/transformer/utils.py b/mesh_tensorflow/transformer/utils.py
index e2c771d8..5dcc74be 100644
--- a/mesh_tensorflow/transformer/utils.py
+++ b/mesh_tensorflow/transformer/utils.py
@@ -219,7 +219,7 @@ def tpu_mesh_shape(tpu_topology=gin.REQUIRED,
   """Create a mesh_shape for data-parallelism and model-parallelism on TPU.

   Example: tpu_mesh_shape("4x4", 8) -> mtf.Shape(("batch", 4), ("model", 8))
-  Since there are 4x4x2=32 total cores, and we want 8-way model paralleism.
+  Since there are 4x4x2=32 total cores, and we want 8-way model parallelism.

   This function is passed through gin to the argument `mesh_shape` inside
   the function `run`.
@@ -1011,7 +1011,7 @@ def encode_delimited_lm(inputs,
                         sequence_length,
                         eos_id=1,
                         include_final_eos=True):
-  """Encode inputs and targets for scoring a delimited langauge model.
+  """Encode inputs and targets for scoring a delimited language model.

   Args:
     inputs: list of strings
@@ -1260,7 +1260,7 @@ def score_from_strings(estimator, vocabulary, model_type, batch_size,
   evenly divide targets N times, where each input has N decodes sequentially
   in targets.

-  The function returns a list of floats represnenting the log-liekelihood of the
+  The function returns a list of floats representing the log-likelihood of the
   target given the input. If `scores_filename` is present, then these are also
   written out as a text file, one per line.
@@ -1340,7 +1340,7 @@ def score_from_dataset(estimator, vocabulary, batch_size, sequence_length,

-  The function returns a list of floats represnenting the log-liekelihood of the
+  The function returns a list of floats representing the log-likelihood of the
   target given the input. If `scores_filename` is present, then these are also
   written out as a text file, one per line.
@@ -1435,7 +1435,7 @@ def get_estimator(model_type, vocabulary, mesh_shape,
     use_tpu: string, the Cloud TPU to use for training
     tpu_job_name: string, name of TPU worker binary
     iterations_per_loop: integer, steps per train loop
-    cluster: a TPUClsuterResolver object
+    cluster: a TPUClusterResolver object
     init_checkpoint: a string, if not None then read in variables from this
       checkpoint path when initializing variables. Will only initialize
       variables that appear both in the current graph and the checkpoint.
@@ -2089,7 +2089,7 @@ def run(tpu_job_name,
     learning_rate_schedule: a function which takes the scalar name argument
       `step` and the numeric argument `total_train_steps` and returns the
       scalar learning rate. Alternatively a float. Alternatively, a list of
-      such factos to be multiplied together.
+      such factors to be multiplied together.
     optimizer: a class extending optimize.Optimizer, required for training
     predict_fn: an optional function, see `get_estimator` docstring for details.
     variable_filter: a string, see `get_estimator` docstring for details.
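
The tpu_mesh_shape docstring touched above spells out the core-count arithmetic for
the "4x4" example: a 4x4 chip topology has 4 * 4 * 2 = 32 cores, so 8-way model
parallelism leaves 32 / 8 = 4-way data parallelism. Below is a minimal pure-Python
sketch of that arithmetic, assuming 2 cores per TPU chip; the helper name is
hypothetical and this is not the actual mesh_tensorflow implementation.

# Hypothetical sketch of the core-count arithmetic described in the
# tpu_mesh_shape docstring; not the real mesh_tensorflow implementation.
def sketch_tpu_mesh_shape(tpu_topology="4x4", model_parallelism=8):
  """Return (name, size) pairs for the "batch" and "model" mesh dimensions."""
  x, y = (int(d) for d in tpu_topology.split("x"))
  total_cores = x * y * 2  # 2 cores per TPU chip, so "4x4" -> 32 cores
  assert total_cores % model_parallelism == 0
  data_parallelism = total_cores // model_parallelism
  return [("batch", data_parallelism), ("model", model_parallelism)]

# Example from the docstring: "4x4" topology with 8-way model parallelism.
print(sketch_tpu_mesh_shape("4x4", 8))  # [('batch', 4), ('model', 8)]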