An implementation of sequence-to-sequence learning for performing addition

Input: "535+61"Output: "596"Padding is handled by using a repeated sentinel character (space)

Input may optionally be reversed, which was shown to increase performance in many tasks in:

"Learning to Execute" (http://arxiv.org/abs/1410.4615)
and
"Sequence to Sequence Learning with Neural Networks" (http://papers.nips.cc/paper/5346-sequence-to-sequence-learning-with-neural-networks.pdf)

Theoretically, reversing introduces shorter-term dependencies between source and target.
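As a minimal illustration of the padding and reversal (plain Python; the '12+345' example is taken from a comment in the script below):

```python
DIGITS = 3
MAXLEN = DIGITS + 1 + DIGITS         # room for 'int+int', e.g. '345+678'

q = '12+345'
query = q + ' ' * (MAXLEN - len(q))  # pad with the space sentinel: '12+345 '
print(repr(query[::-1]))             # reversed input: ' 543+21'
```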

Two digits reversed:
+ One-layer LSTM (128 hidden units), 5k training examples = 99% train/test accuracy in 55 epochs

Three digits reversed:
+ One-layer LSTM (128 hidden units), 50k training examples = 99% train/test accuracy in 100 epochs

Four digits reversed:
+ One-layer LSTM (128 hidden units), 400k training examples = 99% train/test accuracy in 20 epochs

Five digits reversed:
+ One-layer LSTM (128 hidden units), 550k training examples = 99% train/test accuracy in 30 epochs
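Each configuration above corresponds to a choice of the parameter block in the script below; for example, the two-digit row would be (a sketch, using the parameter names defined later in the script):

```python
TRAINING_SIZE = 5000  # 5k training examples
DIGITS = 2            # two-digit operands
REVERSE = True        # feed the input reversed
```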

```python
from __future__ import print_function
from keras.models import Sequential
from keras import layers
import numpy as np
from six.moves import range


class CharacterTable(object):
    """Given a set of characters:
    + Encode them to a one-hot integer representation
    + Decode the one-hot or integer representation to their character output
    + Decode a vector of probabilities to their character output
    """
    def __init__(self, chars):
        """Initialize character table.

        # Arguments
            chars: Characters that can appear in the input.
        """
        self.chars = sorted(set(chars))
        self.char_indices = dict((c, i) for i, c in enumerate(self.chars))
        self.indices_char = dict((i, c) for i, c in enumerate(self.chars))

    def encode(self, C, num_rows):
        """One-hot encode given string C.

        # Arguments
            C: string, to be encoded.
            num_rows: Number of rows in the returned one-hot encoding. This is
                used to keep the number of rows the same for every data item.
        """
        x = np.zeros((num_rows, len(self.chars)))
        for i, c in enumerate(C):
            x[i, self.char_indices[c]] = 1
        return x

    def decode(self, x, calc_argmax=True):
        """Decode the given vector or 2D array to their character output.

        # Arguments
            x: A vector or a 2D array of probabilities or one-hot representations;
                or a vector of character indices (used with `calc_argmax=False`).
            calc_argmax: Whether to find the character index with maximum
                probability, defaults to `True`.
        """
        if calc_argmax:
            x = x.argmax(axis=-1)
        return ''.join(self.indices_char[i] for i in x)
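
# Illustrative usage of CharacterTable (not part of the original script):
#   ctable = CharacterTable('0123456789+ ')
#   ctable.encode('12+3', 7)                 # -> (7, 12) array; rows past the
#                                            #    end of the string stay zero
#   ctable.decode(ctable.encode('12+3', 7))  # -> '12+3   ' (all-zero rows
#                                            #    decode to the space char)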

class colors:
    # ANSI escape codes used to color-code correct/incorrect predictions below.
    ok = '\033[92m'
    fail = '\033[91m'
    close = '\033[0m'

# Parameters for the model and dataset.
TRAINING_SIZE = 50000
DIGITS = 3
REVERSE = True

# Maximum length of input is 'int + int' (e.g., '345+678'). Maximum length of
# int is DIGITS.
MAXLEN = DIGITS + 1 + DIGITS

# All the numbers, plus sign and space for padding.
chars = '0123456789+ '
ctable = CharacterTable(chars)

questions = []
expected = []
seen = set()
print('Generating data...')
while len(questions) < TRAINING_SIZE:
    # Generate a random integer with 1 to DIGITS digits.
    f = lambda: int(''.join(np.random.choice(list('0123456789'))
                            for i in range(np.random.randint(1, DIGITS + 1))))
    a, b = f(), f()
    # Skip any addition questions we've already seen.
    # Also skip any such that a+b == b+a (hence the sorting).
    key = tuple(sorted((a, b)))
    if key in seen:
        continue
    seen.add(key)
    # Pad the data with spaces such that it is always MAXLEN.
    q = '{}+{}'.format(a, b)
    query = q + ' ' * (MAXLEN - len(q))
    ans = str(a + b)
    # Answers can be of maximum size DIGITS + 1.
    ans += ' ' * (DIGITS + 1 - len(ans))
    if REVERSE:
        # Reverse the query, e.g., '12+345 ' becomes ' 543+21'. (Note the
        # space used for padding.)
        query = query[::-1]
    questions.append(query)
    expected.append(ans)
print('Total addition questions:', len(questions))

print('Vectorization...')
x = np.zeros((len(questions), MAXLEN, len(chars)), dtype=bool)
y = np.zeros((len(questions), DIGITS + 1, len(chars)), dtype=bool)
for i, sentence in enumerate(questions):
    x[i] = ctable.encode(sentence, MAXLEN)
for i, sentence in enumerate(expected):
    y[i] = ctable.encode(sentence, DIGITS + 1)

# Shuffle (x, y) in unison, as the later parts of x will almost all be larger
# digits.
indices = np.arange(len(y))
np.random.shuffle(indices)
x = x[indices]
y = y[indices]

# Explicitly set apart 10% for validation data that we never train over.
split_at = len(x) - len(x) // 10
(x_train, x_val) = x[:split_at], x[split_at:]
(y_train, y_val) = y[:split_at], y[split_at:]

print('Training Data:')
print(x_train.shape)
print(y_train.shape)

print('Validation Data:')
print(x_val.shape)
print(y_val.shape)
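# With the defaults above (TRAINING_SIZE=50000, DIGITS=3, 12 characters), this
# prints (45000, 7, 12) / (45000, 4, 12) for training and (5000, 7, 12) /
# (5000, 4, 12) for validation.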

# Try replacing LSTM with GRU or SimpleRNN.
RNN = layers.LSTM
HIDDEN_SIZE = 128
BATCH_SIZE = 128
LAYERS = 1

print('Build model...')
model = Sequential()
# "Encode" the input sequence using an RNN, producing an output of HIDDEN_SIZE.
# Note: in a situation where your input sequences have a variable length,
# use input_shape=(None, num_feature).
model.add(RNN(HIDDEN_SIZE, input_shape=(MAXLEN, len(chars))))
# As the decoder RNN's input, repeatedly provide the last output of the
# encoder RNN for each time step. Repeat 'DIGITS + 1' times, as that's the
# maximum length of the output, e.g., when DIGITS=3, the max output is
# 999+999=1998.
model.add(layers.RepeatVector(DIGITS + 1))
# The decoder RNN can be a single layer or multiple stacked layers.
for _ in range(LAYERS):
    # By setting return_sequences to True, return not only the last output but
    # all the outputs so far, in the form of (num_samples, timesteps,
    # output_dim). This is necessary because the TimeDistributed wrapper below
    # expects a full sequence (a time axis) as its input.
    model.add(RNN(HIDDEN_SIZE, return_sequences=True))

# Apply a Dense layer to every temporal slice of the input. For each step
# of the output sequence, decide which character should be chosen.
model.add(layers.TimeDistributed(layers.Dense(len(chars), activation='softmax')))
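# Shape flow through the model, with the defaults above:
#   input                (batch, MAXLEN=7, len(chars)=12)
#   encoder LSTM      -> (batch, 128)
#   RepeatVector      -> (batch, DIGITS + 1 = 4, 128)
#   decoder LSTM      -> (batch, 4, 128)
#   TimeDistributed   -> (batch, 4, 12), a per-step softmax over characters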

model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
model.summary()

# Train the model each generation and show predictions against the validation
# dataset.
for iteration in range(1, 200):
    print()
    print('-' * 50)
    print('Iteration', iteration)
    model.fit(x_train, y_train,
              batch_size=BATCH_SIZE,
              epochs=1,
              validation_data=(x_val, y_val))
    # Select 10 samples from the validation set at random so we can visualize
    # errors.
    for i in range(10):
        ind = np.random.randint(0, len(x_val))
        rowx, rowy = x_val[np.array([ind])], y_val[np.array([ind])]
        # `predict_classes` is the Keras 2 Sequential API; on newer versions,
        # use np.argmax(model.predict(rowx), axis=-1) instead.
        preds = model.predict_classes(rowx, verbose=0)
        q = ctable.decode(rowx[0])
        correct = ctable.decode(rowy[0])
        guess = ctable.decode(preds[0], calc_argmax=False)
        print('Q', q[::-1] if REVERSE else q, end=' ')
        print('T', correct, end=' ')
        if correct == guess:
            print(colors.ok + '☑' + colors.close, end=' ')
        else:
            print(colors.fail + '☒' + colors.close, end=' ')
        print(guess)
```
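
Once training has run, the same pieces can be reused to query the model on a new problem. A minimal sketch, reusing `ctable`, `model`, `MAXLEN`, and `REVERSE` from the script above; the `ask` helper is not part of the original example:

```python
def ask(a, b):
    """Encode 'a+b' exactly like the training data and decode the prediction."""
    q = '{}+{}'.format(a, b)
    query = q + ' ' * (MAXLEN - len(q))
    if REVERSE:
        query = query[::-1]
    x_new = ctable.encode(query, MAXLEN)[np.newaxis, ...]  # add a batch axis
    pred = model.predict(x_new, verbose=0)                 # (1, DIGITS + 1, 12)
    return ctable.decode(pred[0]).strip()

print(ask(535, 61))  # expected: '596'
```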