Migrate from std::string to tensorflow::tstring.

lingvo-bot · copybara-github · commit b60509fa6585 · 2019-11-25T21:57:52.000-08:00
Note that during the transition period tstring is typedef'ed to std::string. See: tensorflow/community#91.

PiperOrigin-RevId: 282495664
diff --git a/lingvo/core/ops/beam_search_step_op_kernels.cc b/lingvo/core/ops/beam_search_step_op_kernels.cc
@@ -301,7 +301,7 @@ class BeamSearchStepOp : public OpKernel {
                  input_data.size());
         }
       } else if (input.dtype() == DT_STRING) {
-        output->flat<string>() = input.flat<string>();
+        output->flat<tstring>() = input.flat<tstring>();
       }
     }
     return output;
@@ -573,7 +573,7 @@ class BeamSearchStepOp : public OpKernel {
     auto t_out_scores = out_scores->matrix<float>();
     auto t_out_hyps = out_hyps->matrix<int>();
     auto t_out_prev_hyps = out_prev_hyps->matrix<int>();
-    auto t_out_done_hyps = out_done_hyps->matrix<string>();
+    auto t_out_done_hyps = out_done_hyps->matrix<tstring>();
     auto t_out_atten_probs = out_atten_probs->tensor<float, 3>();
     auto t_all_done = all_done->scalar<bool>();
 
@@ -700,7 +700,7 @@ class TopKTerminatedHypsOp : public OpKernel {
                      k, /* unused epsilon id */ -1));
     // Each mutex is used to protect corresponding topk_vec.
     std::vector<mutex> mu_vec(num_beams);
-    auto t_done_hyps = in_done_hyps.matrix<string>();
+    auto t_done_hyps = in_done_hyps.matrix<tstring>();
     // The thread sharding is along hyps_size.
     Shard(kNumWorkers, workers, hyps_size, 1000 * num_steps,
           [&](int64 start, int64 limit) {
@@ -731,7 +731,7 @@ class TopKTerminatedHypsOp : public OpKernel {
             }
           });
 
-    auto t_topk_hyps = topk_hyps->matrix<string>();
+    auto t_topk_hyps = topk_hyps->matrix<tstring>();
     for (int i = 0; i < num_beams; ++i) {
       auto ith_topk = topk_vec[i].Get();
       CHECK_LE(ith_topk.size(), k);
@@ -836,7 +836,7 @@ class UnpackHypOp : public OpKernel {
 
   void Compute(OpKernelContext* ctx) override {
     const Tensor& in_hyps = ctx->input(0);
-    const auto& t_in_hyps = in_hyps.flat<string>();
+    const auto& t_in_hyps = in_hyps.flat<tstring>();
     const int batch_size = t_in_hyps.size();
     std::vector<Hypothesis> hyps(batch_size);
     for (int i = 0; i < batch_size; ++i) {
@@ -991,7 +991,7 @@ class HypsFromBeamSearchOuts : public OpKernel {
 
     Tensor* out_hyps;
     OP_REQUIRES_OK(ctx, ctx->allocate_output(0, hyps.shape(), &out_hyps));
-    auto out_hyps_t = out_hyps->matrix<string>();
+    auto out_hyps_t = out_hyps->matrix<tstring>();
 
     // Use the same thread pool as topk operator.
     static thread::ThreadPool* workers =
diff --git a/lingvo/core/ops/best_step_op_kernels.cc b/lingvo/core/ops/best_step_op_kernels.cc
@@ -52,10 +52,11 @@ class BestStepOp : public OpKernel {
       ::std::unique_ptr<RecordReader> reader(new RecordReader(file.get()));
 
       uint64 offset = 0;
-      string raw_proto;
+      tstring raw_proto;
       while (reader->ReadRecord(&offset, &raw_proto).ok()) {
         Event event;
-        CHECK(::tensorflow::ParseProtoUnlimited(&event, raw_proto));
+        CHECK(::tensorflow::ParseProtoUnlimited(&event, raw_proto.data(),
+                                                raw_proto.size()));
         if (event.what_case() != Event::WhatCase::kSummary) {
           continue;
         }
diff --git a/lingvo/core/ops/generic_input_op_kernels.cc b/lingvo/core/ops/generic_input_op_kernels.cc
@@ -122,7 +122,7 @@ class GenericInputProcessor : public RecordProcessor {
     args[0] = Tensor(DT_INT32, {});
     args[0].scalar<int32>()() = record.source_id;
     args[1] = Tensor(DT_STRING, {});
-    record.value.AppendTo(&args[1].scalar<string>()());
+    args[1].scalar<tensorflow::tstring>()().append(record.value.ToString());
     *bucket_key = 1;
     sample->clear();
     Status status;
@@ -266,7 +266,7 @@ class GenericInputProcessor : public RecordProcessor {
                         CASE(float);
                         CASE(int32);
                         CASE(int64);
-                        CASE(string);
+                        CASE(tstring);
                         CASE(uint8);
                         CASE(bfloat16);
                         CASE(complex64);
diff --git a/lingvo/core/ops/record_batcher_test.cc b/lingvo/core/ops/record_batcher_test.cc
@@ -52,7 +52,7 @@ class TestRP : public RecordProcessor {
     const string val = string(record.value);
     *bucket_key = val.size();
     Tensor t(DT_STRING, {});
-    record.value.AppendTo(&t.scalar<string>()());
+    t.scalar<tstring>()().append(record.value.ToString());
     Tensor ids(DT_STRING, {1});
     auto lab = ids.flat<tensorflow::tstring>();
     lab(0) = absl::StrCat(record.source_id);
@@ -68,8 +68,8 @@ class TestRP : public RecordProcessor {
     Tensor t(DT_STRING, {n});
     Tensor source_ids(DT_STRING, {n});
     for (int i = 0; i < samples.size(); ++i) {
-      t.flat<string>()(i) = samples[i][0].scalar<string>()();
-      source_ids.flat<string>()(i) = samples[i][1].scalar<string>()();
+      t.flat<tstring>()(i) = samples[i][0].scalar<tstring>()();
+      source_ids.flat<tstring>()(i) = samples[i][1].scalar<tstring>()();
     }
     batch->clear();
     batch->push_back(std::move(t));
@@ -107,13 +107,13 @@ TEST(RecordBatcher, Basic) {
     ASSERT_LE(source_ids.dim_size(0), bopts.bucket_batch_limit[bucket_id]);
     int maxlen = 0;
     for (int j = 0; j < t.dim_size(0); ++j) {
-      auto len = t.vec<string>()(j).size();
+      auto len = t.vec<tstring>()(j).size();
       EXPECT_LE(len, bopts.bucket_upper_bound[bucket_id]);
       if (bucket_id != 0) {
         EXPECT_LT(bopts.bucket_upper_bound[bucket_id - 1], len);
       }
       maxlen = std::max<int>(maxlen, len);
-      ASSERT_EQ(source_ids.vec<string>()(j), "0");
+      ASSERT_EQ(source_ids.vec<tstring>()(j), "0");
     }
     VLOG(1) << bucket_id << " " << t.dim_size(0) << " " << maxlen;
   }
@@ -146,7 +146,7 @@ TEST(RecordBatcher, BasicMultiThread) {
     ASSERT_LE(t.dim_size(0), bopts.bucket_batch_limit[bucket_id]);
     int maxlen = 0;
     for (int j = 0; j < t.dim_size(0); ++j) {
-      auto len = t.vec<string>()(j).size();
+      auto len = t.vec<tstring>()(j).size();
       EXPECT_LE(len, bopts.bucket_upper_bound[bucket_id]);
       if (bucket_id != 0) {
         EXPECT_LT(bopts.bucket_upper_bound[bucket_id - 1], len);
@@ -195,7 +195,7 @@ TEST(RecordBatcher, LearnBuckets) {
     const Tensor& t = batch[0];
     int maxlen = 0;
     for (int j = 0; j < t.dim_size(0); ++j) {
-      int len = t.vec<string>()(j).size();
+      int len = t.vec<tstring>()(j).size();
       maxlen = std::max<int>(maxlen, len);
     }
     maxlens[bucket_id] += maxlen;
@@ -241,7 +241,7 @@ TEST(RecordBatcher, FullEpoch) {
     TF_CHECK_OK(batcher.GetNext(&bucket_id, &batch));
     const Tensor& t = batch[0];
     for (int j = 0; j < t.dim_size(0); ++j) {
-      records.push_back(t.vec<string>()(j));
+      records.push_back(t.vec<tstring>()(j));
     }
   }
   ASSERT_EQ(N, records.size());
@@ -279,7 +279,7 @@ TEST(RecordBatcher, CaptureYielderStatus) {
       const Tensor& t = batch[i];
       if (t.dtype() == DT_STRING) {
         for (int j = 0; j < t.dim_size(0); ++j) {
-          records.push_back(t.vec<string>()(j));
+          records.push_back(t.vec<tstring>()(j));
         }
       }
     }
diff --git a/lingvo/core/ops/record_yielder.cc b/lingvo/core/ops/record_yielder.cc
@@ -250,7 +250,7 @@ class TFRecordIterator : public RecordIterator {
   std::unique_ptr<RandomAccessFile> file_;
   io::SequentialRecordReader reader_;
   int64 num_ = 0;
-  string record_;
+  tstring record_;
 
   io::RecordReaderOptions ReaderOptions(const string& compression_type) {
     auto opts =
diff --git a/lingvo/core/ops/rope.h b/lingvo/core/ops/rope.h
@@ -30,6 +30,8 @@ class Rope : public std::string {
   void AppendTo(std::string* dst) const {
     dst->append(*this);
   }
+
+  std::string ToString() const { return *this; }
 };
 
 }  // namespace lingvo
diff --git a/lingvo/core/ops/simple_vocab.cc b/lingvo/core/ops/simple_vocab.cc
@@ -161,13 +161,13 @@ class VocabTokenToIdOp : public OpKernel {
     Tensor* id;
     OP_REQUIRES_OK(ctx, ctx->allocate_output("id", token->shape(), &id));
     if (token->dims() == 0) {
-      id->scalar<int32>()() = vocab_.TokenToId(token->scalar<string>()());
+      id->scalar<int32>()() = vocab_.TokenToId(token->scalar<tstring>()());
     } else {
       OP_REQUIRES(
           ctx, token->dims() == 1,
           errors::InvalidArgument("Input must be a scalar or 1D tensor."));
       for (int i = 0; i < token->dim_size(0); i++) {
-        id->vec<int32>()(i) = vocab_.TokenToId(token->vec<string>()(i));
+        id->vec<int32>()(i) = vocab_.TokenToId(token->vec<tstring>()(i));
       }
     }
   }
@@ -196,13 +196,13 @@ class VocabIdToTokenOp : public OpKernel {
     Tensor* token;
     OP_REQUIRES_OK(ctx, ctx->allocate_output("token", id->shape(), &token));
     if (id->dims() == 0) {
-      token->scalar<string>()() = vocab_.IdToToken(id->scalar<int32>()());
+      token->scalar<tstring>()() = vocab_.IdToToken(id->scalar<int32>()());
     } else {
       OP_REQUIRES(
           ctx, id->dims() == 1,
           errors::InvalidArgument("Input must be a scalar or 1D tensor."));
       for (int i = 0; i < id->dim_size(0); i++) {
-        token->vec<string>()(i) = vocab_.IdToToken(id->vec<int32>()(i));
+        token->vec<tstring>()(i) = vocab_.IdToToken(id->vec<int32>()(i));
       }
     }
   }
@@ -232,13 +232,13 @@ class TokenInVocabOp : public OpKernel {
     OP_REQUIRES_OK(ctx,
                    ctx->allocate_output("result", token->shape(), &result));
     if (token->dims() == 0) {
-      result->scalar<bool>()() = vocab_.InVocab(token->scalar<string>()());
+      result->scalar<bool>()() = vocab_.InVocab(token->scalar<tstring>()());
     } else {
       OP_REQUIRES(
           ctx, token->dims() == 1,
           errors::InvalidArgument("Input must be a scalar or 1D tensor."));
       for (int i = 0; i < token->dim_size(0); i++) {
-        result->vec<bool>()(i) = vocab_.InVocab(token->vec<string>()(i));
+        result->vec<bool>()(i) = vocab_.InVocab(token->vec<tstring>()(i));
       }
     }
   }
diff --git a/lingvo/core/ops/static_map_op.cc b/lingvo/core/ops/static_map_op.cc
@@ -34,7 +34,7 @@ void Iota(std::vector<T>* vec) {
 }
 
 template <>
-void Iota<string>(std::vector<string>* vec) {
+void Iota<tstring>(std::vector<tstring>* vec) {
   // Do nothing.
 }
 
@@ -96,22 +96,22 @@ class StaticMapOp : public OpKernel {
 };
 
 REGISTER_KERNEL_BUILDER(Name("StaticMapStringInt").Device(DEVICE_CPU),
-                        StaticMapOp<string, int32>);
+                        StaticMapOp<tstring, int32>);
 REGISTER_KERNEL_BUILDER(Name("StaticMapIntString").Device(DEVICE_CPU),
-                        StaticMapOp<int32, string>);
+                        StaticMapOp<int32, tstring>);
 
 #if GOOGLE_CUDA
 REGISTER_KERNEL_BUILDER(Name("StaticMapStringInt")
                             .Device(DEVICE_GPU)
                             .HostMemory("x")
                             .HostMemory("y"),
-                        StaticMapOp<string, int32>);
+                        StaticMapOp<tstring, int32>);
 
 REGISTER_KERNEL_BUILDER(Name("StaticMapIntString")
                             .Device(DEVICE_GPU)
                             .HostMemory("x")
                             .HostMemory("y"),
-                        StaticMapOp<int32, string>);
+                        StaticMapOp<int32, tstring>);
 #endif
 
 }  // namespace
diff --git a/lingvo/core/ops/tokenizer_op_headers.h b/lingvo/core/ops/tokenizer_op_headers.h
@@ -46,7 +46,7 @@ class LabelToTokenIdOp : public OpKernel {
                 errors::InvalidArgument("labels must be a vector, but get ",
                                         labels.shape().DebugString()));
     const int batch = labels.NumElements();
-    auto Tlabels = labels.flat<string>();
+    auto Tlabels = labels.flat<tstring>();
     Tensor token_ids(DT_INT32, TensorShape({batch, maxlen_}));
     auto Ttoken_ids = token_ids.matrix<int32>();
     Ttoken_ids.setZero();  // Sanity
@@ -131,7 +131,7 @@ class IdToTokenOp : public OpKernel {
     OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({batch}), &out));
     const auto& t_ids = ids.matrix<int32>();
     const auto& t_seq_lens = seq_lens.vec<int32>();
-    auto t_out = out->template vec<string>();
+    auto t_out = out->template vec<tstring>();
     for (int i = 0; i < batch; ++i) {
       const int len_i = std::max(0, t_seq_lens(i));
       std::vector<int32> ids_i(len_i);
diff --git a/lingvo/core/ops/tokenizer_ops_kernels.cc b/lingvo/core/ops/tokenizer_ops_kernels.cc
@@ -56,7 +56,7 @@ class StrToVocabTokensOp : public OpKernel {
   void Compute(OpKernelContext* ctx) override {
     const Tensor* labels;
     OP_REQUIRES_OK(ctx, ctx->input("labels", &labels));
-    const auto& t_label = labels->vec<string>();
+    const auto& t_label = labels->vec<tstring>();
     const int32 b_size = labels->dim_size(0);
     Tensor token_ids(DT_INT32, TensorShape({b_size, maxlen_}));
     Tensor target_ids(DT_INT32, TensorShape({b_size, maxlen_}));
@@ -177,7 +177,7 @@ class NgramIdToTokenOp : public OpKernel {
     OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({batch}), &out));
     const auto& t_ids = token_ids->matrix<int32>();
     const auto& t_seq_lens = seq_lengths->vec<int32>();
-    auto t_out = out->template vec<string>();
+    auto t_out = out->template vec<tstring>();
     for (int i = 0; i < batch; ++i) {
       const int len_i = std::max(0, t_seq_lens(i));
       std::vector<int32> ids_i(len_i);
@@ -235,7 +235,7 @@ class BpeIdsToWordsOp : public OpKernel {
     OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({batch}), &out));
     const auto& t_ids = token_ids->matrix<int32>();
     const auto& t_seq_lens = seq_lengths->vec<int32>();
-    auto t_out = out->template vec<string>();
+    auto t_out = out->template vec<tstring>();
     for (int i = 0; i < batch; ++i) {
       const int len_i = std::max(0, t_seq_lens(i));
       std::vector<string> labels;
@@ -293,7 +293,7 @@ class BpeWordsToIdsOp : public OpKernel {
   void Compute(OpKernelContext* ctx) override {
     const Tensor* labels;
     OP_REQUIRES_OK(ctx, ctx->input("labels", &labels));
-    const auto& t_label = labels->vec<string>();
+    const auto& t_label = labels->vec<tstring>();
     const int32 b_size = labels->dim_size(0);
     Tensor* token_ids = nullptr;
     Tensor* target_ids = nullptr;

Original file line number	Diff line number	Diff line change
`@@ -30,6 +30,8 @@ class Rope : public std::string {`
`30`	`30`	`void AppendTo(std::string* dst) const {`
`31`	`31`	`dst->append(*this);`
`32`	`32`	`}`
	`33`	`+`
	`34`	`+ std::string ToString() const { return *this; }`
`33`	`35`	`};`
`34`	`36`
`35`	`37`	`} // namespace lingvo`
Original file line number	Diff line number	Diff line change
`@@ -161,13 +161,13 @@ class VocabTokenToIdOp : public OpKernel {`
`161`	`161`	`Tensor* id;`
`162`	`162`	`OP_REQUIRES_OK(ctx, ctx->allocate_output("id", token->shape(), &id));`
`163`	`163`	`if (token->dims() == 0) {`
`164`		`- id->scalar<int32>()() = vocab_.TokenToId(token->scalar<string>()());`
	`164`	`+ id->scalar<int32>()() = vocab_.TokenToId(token->scalar<tstring>()());`
`165`	`165`	`} else {`
`166`	`166`	`OP_REQUIRES(`
`167`	`167`	`ctx, token->dims() == 1,`
`168`	`168`	`errors::InvalidArgument("Input must be a scalar or 1D tensor."));`
`169`	`169`	`for (int i = 0; i < token->dim_size(0); i++) {`
`170`		`- id->vec<int32>()(i) = vocab_.TokenToId(token->vec<string>()(i));`
	`170`	`+ id->vec<int32>()(i) = vocab_.TokenToId(token->vec<tstring>()(i));`
`171`	`171`	`}`
`172`	`172`	`}`
`173`	`173`	`}`
`@@ -196,13 +196,13 @@ class VocabIdToTokenOp : public OpKernel {`
`196`	`196`	`Tensor* token;`
`197`	`197`	`OP_REQUIRES_OK(ctx, ctx->allocate_output("token", id->shape(), &token));`
`198`	`198`	`if (id->dims() == 0) {`
`199`		`- token->scalar<string>()() = vocab_.IdToToken(id->scalar<int32>()());`
	`199`	`+ token->scalar<tstring>()() = vocab_.IdToToken(id->scalar<int32>()());`
`200`	`200`	`} else {`
`201`	`201`	`OP_REQUIRES(`
`202`	`202`	`ctx, id->dims() == 1,`
`203`	`203`	`errors::InvalidArgument("Input must be a scalar or 1D tensor."));`
`204`	`204`	`for (int i = 0; i < id->dim_size(0); i++) {`
`205`		`- token->vec<string>()(i) = vocab_.IdToToken(id->vec<int32>()(i));`
	`205`	`+ token->vec<tstring>()(i) = vocab_.IdToToken(id->vec<int32>()(i));`
`206`	`206`	`}`
`207`	`207`	`}`
`208`	`208`	`}`
`@@ -232,13 +232,13 @@ class TokenInVocabOp : public OpKernel {`
`232`	`232`	`OP_REQUIRES_OK(ctx,`
`233`	`233`	`ctx->allocate_output("result", token->shape(), &result));`
`234`	`234`	`if (token->dims() == 0) {`
`235`		`- result->scalar<bool>()() = vocab_.InVocab(token->scalar<string>()());`
	`235`	`+ result->scalar<bool>()() = vocab_.InVocab(token->scalar<tstring>()());`
`236`	`236`	`} else {`
`237`	`237`	`OP_REQUIRES(`
`238`	`238`	`ctx, token->dims() == 1,`
`239`	`239`	`errors::InvalidArgument("Input must be a scalar or 1D tensor."));`
`240`	`240`	`for (int i = 0; i < token->dim_size(0); i++) {`
`241`		`- result->vec<bool>()(i) = vocab_.InVocab(token->vec<string>()(i));`
	`241`	`+ result->vec<bool>()(i) = vocab_.InVocab(token->vec<tstring>()(i));`
`242`	`242`	`}`
`243`	`243`	`}`
`244`	`244`	`}`