# http://pytorch.org/tutorials/beginner/nlp/word_embeddings_tutorial.html?highlight=embedding
# The following tutorial is from the PyTorch site.
# =======================================================================

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# Import VisualDL
from visualdl import LogWriter

torch.manual_seed(1)
CONTEXT_SIZE = 2
EMBEDDING_DIM = 10
# We will use Shakespeare Sonnet 2
test_sentence = """When forty winters shall besiege thy brow,
And dig deep trenches in thy beauty's field,
Thy youth's proud livery so gazed on now,
Will be a totter'd weed of small worth held:
Then being asked, where all thy beauty lies,
Where all the treasure of thy lusty days;
To say, within thine own deep sunken eyes,
Were an all-eating shame, and thriftless praise.
How much more praise deserv'd thy beauty's use,
If thou couldst answer 'This fair child of mine
Shall sum my count, and make my old excuse,'
Proving his beauty by succession thine!
This were to be new made when thou art old,
And see thy blood warm when thou feel'st it cold.""".split()
# We should tokenize the input, but we will ignore that for now.
# Build a list of tuples. Each tuple is ([ word_i-2, word_i-1 ], target word).
trigrams = [([test_sentence[i], test_sentence[i + 1]], test_sentence[i + 2])
            for i in range(len(test_sentence) - 2)]
# Print the first 3, just so you can see what they look like.
print(trigrams[:3])
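# e.g. the first three trigrams are:
# [(['When', 'forty'], 'winters'), (['forty', 'winters'], 'shall'),
#  (['winters', 'shall'], 'besiege')]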

vocab = set(test_sentence)
word_to_ix = {word: i for i, word in enumerate(vocab)}
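# Note: `vocab` is a set, so the word-to-index assignment depends on set
# iteration order and can differ from run to run.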


class NGramLanguageModeler(nn.Module):
    def __init__(self, vocab_size, embedding_dim, context_size):
        super(NGramLanguageModeler, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear1 = nn.Linear(context_size * embedding_dim, 128)
        self.linear2 = nn.Linear(128, vocab_size)

    def forward(self, inputs):
        embeds = self.embeddings(inputs).view((1, -1))
        out = F.relu(self.linear1(embeds))
        out = self.linear2(out)
        log_probs = F.log_softmax(out, dim=1)
        return log_probs


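# Shape walkthrough for NGramLanguageModeler.forward with CONTEXT_SIZE = 2
# and EMBEDDING_DIM = 10:
#   inputs:                  LongTensor of shape (2,)  -- two word indices
#   self.embeddings(inputs): shape (2, 10)             -- one row per context word
#   .view((1, -1)):          shape (1, 20)             -- concatenated context
#   linear1 -> relu:         shape (1, 128)
#   linear2 -> log_softmax:  shape (1, vocab_size)     -- log-probabilities over the vocab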
losses = []
loss_function = nn.NLLLoss()
model = NGramLanguageModeler(len(vocab), EMBEDDING_DIM, CONTEXT_SIZE)
optimizer = optim.SGD(model.parameters(), lr=0.001)
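# NLLLoss expects log-probabilities as input, which is why the model's forward
# pass ends with F.log_softmax; together they are equivalent to applying
# CrossEntropyLoss to the raw linear2 outputs.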

for epoch in range(10):
    total_loss = 0.0
    for context, target in trigrams:

        # Step 1. Prepare the inputs to be passed to the model (i.e., turn the
        # words into integer indices and wrap them in tensors)
        context_idxs = torch.tensor(
            [word_to_ix[w] for w in context], dtype=torch.long)

        # Step 2. Recall that torch *accumulates* gradients. Before passing in a
        # new instance, you need to zero out the gradients from the old
        # instance
        model.zero_grad()

        # Step 3. Run the forward pass, getting log probabilities over next
        # words
        log_probs = model(context_idxs)

        # Step 4. Compute your loss function. (Again, torch wants the target
        # word wrapped in a tensor)
        loss = loss_function(
            log_probs, torch.tensor([word_to_ix[target]], dtype=torch.long))

        # Step 5. Do the backward pass and update the parameters
        loss.backward()
        optimizer.step()

        # Get the Python number from a 1-element tensor by calling tensor.item()
        total_loss += loss.item()
    losses.append(total_loss)
print(losses)  # The total loss decreases with each epoch over the training data!

# VisualDL setup
logw = LogWriter("./embedding_log", sync_cycle=10000)
with logw.mode('train') as logger:
    embedding = logger.embedding()

embeddings_list = model.embeddings.weight.data.numpy()  # convert to a numpy array

# The VisualDL embedding log writer takes two parameters:
# the first is the embedding list, of type list[list[float]];
# the second is word_dict, a dict mapping each word (str) to its index (int).
embedding.add_embeddings_with_word_dict(embeddings_list, word_to_ix)
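
# Optional sanity check (not part of the original tutorial): a minimal sketch of
# inspecting the learned vectors directly, e.g. the cosine similarity between two
# words that both occur in the sonnet. The word choices here are just illustrative.
import numpy as np


def cosine_similarity(a, b):
    # Cosine of the angle between two embedding vectors.
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))


print(cosine_similarity(embeddings_list[word_to_ix["thy"]],
                        embeddings_list[word_to_ix["beauty"]]))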