# How to visualize embeddings with VisualDL

Here we would like to show you how to visualize embeddings with
VisualDL in PyTorch.

Embeddings are often used in NLP (Natural Language Processing); they represent
semantic meanings as high-dimensional vectors.

Embedding visualization is useful for verifying the training algorithm,
as visualization can reduce the high-dimensional vectors to a 2D / 3D space.
The closer two words are, the more semantic meaning they share.
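
To make the idea of "closeness" concrete, here is a tiny, self-contained sketch
(not part of the demo; the two vectors and their dimensionality are made up purely
for illustration) that measures how close two word vectors are with cosine similarity:

```
import numpy as np

# Two made-up word vectors; in practice they would come from a trained embedding layer.
vec_king = np.array([0.8, 0.2, 0.4])
vec_queen = np.array([0.9, 0.1, 0.3])

# Cosine similarity: a value close to 1.0 means the two words are near each other
# in the embedding space, i.e. they share more semantic meaning.
cosine = np.dot(vec_king, vec_queen) / (np.linalg.norm(vec_king) * np.linalg.norm(vec_queen))
print(cosine)
```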

We use the PyTorch [embedding example](http://pytorch.org/tutorials/beginner/nlp/word_embeddings_tutorial.html) as
the base. Here is the whole embedding program; the following block is a working Python script.
Feel free to test it in your Python environment.

```
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

torch.manual_seed(1)

CONTEXT_SIZE = 2
EMBEDDING_DIM = 10
# We will use Shakespeare Sonnet 2
test_sentence = """When forty winters shall besiege thy brow,
And dig deep trenches in thy beauty's field,
Thy youth's proud livery so gazed on now,
Will be a totter'd weed of small worth held:
Then being asked, where all thy beauty lies,
Where all the treasure of thy lusty days;
To say, within thine own deep sunken eyes,
Were an all-eating shame, and thriftless praise.
How much more praise deserv'd thy beauty's use,
If thou couldst answer 'This fair child of mine
Shall sum my count, and make my old excuse,'
Proving his beauty by succession thine!
This were to be new made when thou art old,
And see thy blood warm when thou feel'st it cold.""".split()
# We should tokenize the input, but we will ignore that for now.
# Build a list of tuples. Each tuple is ([ word_i-2, word_i-1 ], target word).
trigrams = [([test_sentence[i], test_sentence[i + 1]], test_sentence[i + 2])
            for i in range(len(test_sentence) - 2)]
# Print the first 3, just so you can see what they look like.
print(trigrams[:3])

vocab = set(test_sentence)
word_to_ix = {word: i for i, word in enumerate(vocab)}


class NGramLanguageModeler(nn.Module):

    def __init__(self, vocab_size, embedding_dim, context_size):
        super(NGramLanguageModeler, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear1 = nn.Linear(context_size * embedding_dim, 128)
        self.linear2 = nn.Linear(128, vocab_size)

    def forward(self, inputs):
        embeds = self.embeddings(inputs).view((1, -1))
        out = F.relu(self.linear1(embeds))
        out = self.linear2(out)
        log_probs = F.log_softmax(out, dim=1)
        return log_probs


losses = []
loss_function = nn.NLLLoss()
model = NGramLanguageModeler(len(vocab), EMBEDDING_DIM, CONTEXT_SIZE)
optimizer = optim.SGD(model.parameters(), lr=0.001)

for epoch in range(10):
    total_loss = torch.Tensor([0])
    for context, target in trigrams:

        # Step 1. Prepare the inputs to be passed to the model (i.e., turn the words
        # into integer indices and wrap them in tensors).
        context_idxs = torch.tensor([word_to_ix[w] for w in context], dtype=torch.long)

        # Step 2. Recall that torch *accumulates* gradients. Before passing in a
        # new instance, you need to zero out the gradients from the old instance.
        model.zero_grad()

        # Step 3. Run the forward pass, getting log probabilities over next words.
        log_probs = model(context_idxs)

        # Step 4. Compute your loss function. (Again, torch wants the target
        # word wrapped in a tensor.)
        loss = loss_function(log_probs, torch.tensor([word_to_ix[target]], dtype=torch.long))

        # Step 5. Do the backward pass and update the parameters.
        loss.backward()
        optimizer.step()

        # Get the Python number from a 1-element Tensor by calling tensor.item().
        total_loss += loss.item()
    losses.append(total_loss)
print(losses)  # The loss decreased every iteration over the training data!
```

That's all the code you need to generate your first embedding.

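Before moving on, it can be helpful to peek at what was learned. The short snippet
below is an optional addition, not part of the original demo; it assumes the training
script above has already run and simply prints the learned vector for one word from
the sonnet (`'beauty'` is just an example token that appears in the text):

```
# Look up the learned EMBEDDING_DIM-dimensional vector for a single word.
# 'beauty' occurs in the sonnet, so it is guaranteed to be in word_to_ix.
word_idx = torch.tensor([word_to_ix['beauty']], dtype=torch.long)
print(model.embeddings(word_idx))                   # shape: (1, EMBEDDING_DIM)
print(model.embeddings.weight[word_to_ix['beauty']])  # the same vector, read from the weight matrix
```
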
Now, let us add a little bit of code to store the embedding in a VisualDL log
so we can visualize it later.

```
# Import VisualDL
from visualdl import LogWriter
# VisualDL setup
logw = LogWriter("./embedding_log", sync_cycle=10000)
with logw.mode('train') as logger:
    embedding = logger.embedding()

embeddings_list = model.embeddings.weight.data.numpy()  # convert to a numpy array

# The VisualDL embedding log writer takes two parameters:
# the first is the embedding list, of type list[list[float]];
# the second is the word dict, a dictionary mapping each word (string) to its index (int).
embedding.add_embeddings_with_word_dict(embeddings_list, word_to_ix)
```

Insert the above code snippet at the end of your embedding training program,
after the training loop has finished, so the learned weights are what gets logged.

This will save the embeddings and the word dictionary to the `./embedding_log` folder.

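Before starting the server, you can optionally double-check that what was logged lines
up with the vocabulary. This is just a sanity check of mine (not required by VisualDL),
assuming the training script and the logging snippet above have both run:

```
# Optional sanity check: the logged array should contain one row per
# vocabulary word, and each row should have EMBEDDING_DIM values.
assert embeddings_list.shape == (len(word_to_ix), EMBEDDING_DIM)
print(embeddings_list.shape)  # e.g. (number_of_unique_words, 10)
```
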
We can now start VisualDL by running `visualDL --logdir=./embedding_log`.
Use your browser to navigate to `localhost:8080` and switch the tab to `High Dimensional`.

You can download the tutorial code [here](https://github.com/PaddlePaddle/VisualDL/blob/develop/demo/pytorch/pytorch_word2vec.py).