Example script to generate text from Nietzsche's writings.

At least 20 epochs are required before the generated text starts sounding coherent.

It is recommended to run this script on GPU, as recurrent networks are quite computationally intensive.

If you try this script on new data, make sure your corpus has at least ~100k characters. ~1M is better.

from __future__ import print_function

from keras.callbacks import LambdaCallback
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.optimizers import RMSprop
from keras.utils.data_utils import get_file
import numpy as np
import random
import sys
import io

path = get_file(
    'nietzsche.txt',
    origin='https://s3.amazonaws.com/text-datasets/nietzsche.txt')
with io.open(path, encoding='utf-8') as f:
    text = f.read().lower()
print('corpus length:', len(text))

chars = sorted(list(set(text)))
print('total chars:', len(chars))
char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))

# cut the text in semi-redundant sequences of maxlen characters
maxlen = 40
step = 3
sentences = []
next_chars = []
for i in range(0, len(text) - maxlen, step):
    sentences.append(text[i: i + maxlen])
    next_chars.append(text[i + maxlen])
print('nb sequences:', len(sentences))
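
# For illustration only (toy values, not part of the original script): with
# maxlen=4 and step=3, the corpus 'the quick' would yield
# sentences == ['the ', ' qui'] and next_chars == ['q', 'c'] -- each window
# of maxlen characters is paired with the character that follows it.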

print('Vectorization...')
x = np.zeros((len(sentences), maxlen, len(chars)), dtype=bool)
y = np.zeros((len(sentences), len(chars)), dtype=bool)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        x[i, t, char_indices[char]] = 1
    y[i, char_indices[next_chars[i]]] = 1
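
# The resulting one-hot tensors have shapes
# x: (nb sequences, maxlen, nb chars) and y: (nb sequences, nb chars):
# each training example maps a window of characters to the one that follows.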

# build the model: a single LSTM
print('Build model...')
model = Sequential()
model.add(LSTM(128, input_shape=(maxlen, len(chars))))
model.add(Dense(len(chars), activation='softmax'))

optimizer = RMSprop(learning_rate=0.01)
model.compile(loss='categorical_crossentropy', optimizer=optimizer)

def sample(preds, temperature=1.0):
    # helper function to sample an index from a probability array
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)
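
# Dividing the log-probabilities by the temperature reweights the
# distribution before sampling: temperatures below 1.0 sharpen it toward the
# most likely character, while temperatures above 1.0 flatten it toward
# uniform, producing more surprising (and more error-prone) text.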

def on_epoch_end(epoch, _):
    # Function invoked at end of each epoch. Prints generated text.
    print()
    print('----- Generating text after Epoch: %d' % epoch)

    start_index = random.randint(0, len(text) - maxlen - 1)
    for diversity in [0.2, 0.5, 1.0, 1.2]:
        print('----- diversity:', diversity)

        generated = ''
        sentence = text[start_index: start_index + maxlen]
        generated += sentence
        print('----- Generating with seed: "' + sentence + '"')
        sys.stdout.write(generated)

        for i in range(400):
            x_pred = np.zeros((1, maxlen, len(chars)))
            for t, char in enumerate(sentence):
                x_pred[0, t, char_indices[char]] = 1.

            preds = model.predict(x_pred, verbose=0)[0]
            next_index = sample(preds, diversity)
            next_char = indices_char[next_index]

            sentence = sentence[1:] + next_char

            sys.stdout.write(next_char)
            sys.stdout.flush()
        print()

print_callback = LambdaCallback(on_epoch_end=on_epoch_end)

model.fit(x, y,
          batch_size=128,
          epochs=60,
          callbacks=[print_callback])
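
# Optional addition (not part of the original example): persist the trained
# model so text can be generated later without retraining. The filename here
# is an illustrative choice.
model.save('nietzsche_lstm.h5')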