This example demonstrates the use of fastText for text classification.

Based on Joulin et al.'s paper:

[Bags of Tricks for Efficient Text Classification](https://arxiv.org/abs/1607.01759)

Results on the IMDB dataset with uni-gram and bi-gram embeddings:

| Embedding | Accuracy (5 epochs) | Speed (s/epoch) | Hardware     |
|-----------|---------------------|-----------------|--------------|
| Uni-gram  | 0.8813              | 8               | i7 CPU       |
| Bi-gram   | 0.9056              | 2               | GTX 980M GPU |
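
The model at the heart of this example is deliberately small: token embeddings (and, when `ngram_range > 1`, n-gram token embeddings) are averaged over the whole document and fed to a single sigmoid unit. As a quick orientation, here is a condensed sketch of just that model-building step, using the same layer types and hyperparameter values as the full script below; `vocab_size` is a placeholder, since the script starts from `max_features = 20000` and enlarges it when n-gram tokens are added.

```python
# Condensed sketch of the fastText-style classifier built in the full script below.
# vocab_size is a placeholder: the script derives the real value from the IMDB
# vocabulary plus any added n-gram tokens.
from keras.models import Sequential
from keras.layers import Dense, Embedding, GlobalAveragePooling1D

vocab_size = 20000
embedding_dims = 50
maxlen = 400

model = Sequential()
model.add(Embedding(vocab_size, embedding_dims, input_length=maxlen))  # token embeddings
model.add(GlobalAveragePooling1D())         # average embeddings over the document
model.add(Dense(1, activation='sigmoid'))   # binary sentiment prediction
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
```

Everything else in the script is data loading, the optional n-gram feature construction, and sequence padding. The complete script follows.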

```python
from __future__ import print_function
import numpy as np
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Embedding
from keras.layers import GlobalAveragePooling1D
from keras.datasets import imdb


def create_ngram_set(input_list, ngram_value=2):
    """
    Extract a set of n-grams from a list of integers.

    >>> create_ngram_set([1, 4, 9, 4, 1, 4], ngram_value=2)
    {(4, 9), (4, 1), (1, 4), (9, 4)}

    >>> create_ngram_set([1, 4, 9, 4, 1, 4], ngram_value=3)
    {(1, 4, 9), (4, 9, 4), (9, 4, 1), (4, 1, 4)}
    """
    return set(zip(*[input_list[i:] for i in range(ngram_value)]))


def add_ngram(sequences, token_indice, ngram_range=2):
    """
    Augment the input list of lists (sequences) by appending n-gram values.

    Example: adding bi-gram
    >>> sequences = [[1, 3, 4, 5], [1, 3, 7, 9, 2]]
    >>> token_indice = {(1, 3): 1337, (9, 2): 42, (4, 5): 2017}
    >>> add_ngram(sequences, token_indice, ngram_range=2)
    [[1, 3, 4, 5, 1337, 2017], [1, 3, 7, 9, 2, 1337, 42]]

    Example: adding tri-gram
    >>> sequences = [[1, 3, 4, 5], [1, 3, 7, 9, 2]]
    >>> token_indice = {(1, 3): 1337, (9, 2): 42, (4, 5): 2017, (7, 9, 2): 2018}
    >>> add_ngram(sequences, token_indice, ngram_range=3)
    [[1, 3, 4, 5, 1337, 2017], [1, 3, 7, 9, 2, 1337, 42, 2018]]
    """
    new_sequences = []
    for input_list in sequences:
        new_list = input_list[:]
        for ngram_value in range(2, ngram_range + 1):
            for i in range(len(new_list) - ngram_value + 1):
                ngram = tuple(new_list[i:i + ngram_value])
                if ngram in token_indice:
                    new_list.append(token_indice[ngram])
        new_sequences.append(new_list)

    return new_sequences


# Set parameters:
# ngram_range = 2 will add bi-gram features
ngram_range = 1
max_features = 20000
maxlen = 400
batch_size = 32
embedding_dims = 50
epochs = 5

print('Loading data...')
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)
print(len(x_train), 'train sequences')
print(len(x_test), 'test sequences')
print('Average train sequence length: {}'.format(
    np.mean(list(map(len, x_train)), dtype=int)))
print('Average test sequence length: {}'.format(
    np.mean(list(map(len, x_test)), dtype=int)))

if ngram_range > 1:
    print('Adding {}-gram features'.format(ngram_range))
    # Create a set of unique n-grams from the training set.
    ngram_set = set()
    for input_list in x_train:
        for i in range(2, ngram_range + 1):
            set_of_ngram = create_ngram_set(input_list, ngram_value=i)
            ngram_set.update(set_of_ngram)

    # Dictionary mapping n-gram token to a unique integer.
    # Integer values are greater than max_features in order
    # to avoid collision with existing features.
    start_index = max_features + 1
    token_indice = {v: k + start_index for k, v in enumerate(ngram_set)}
    indice_token = {token_indice[k]: k for k in token_indice}

    # max_features is the highest integer that could be found in the dataset.
    max_features = np.max(list(indice_token.keys())) + 1

    # Augmenting x_train and x_test with n-gram features
    x_train = add_ngram(x_train, token_indice, ngram_range)
    x_test = add_ngram(x_test, token_indice, ngram_range)
    print('Average train sequence length: {}'.format(
        np.mean(list(map(len, x_train)), dtype=int)))
    print('Average test sequence length: {}'.format(
        np.mean(list(map(len, x_test)), dtype=int)))

print('Pad sequences (samples x time)')
x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
x_test = sequence.pad_sequences(x_test, maxlen=maxlen)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)

print('Build model...')
model = Sequential()

# we start off with an efficient embedding layer which maps
# our vocab indices into embedding_dims dimensions
model.add(Embedding(max_features,
                    embedding_dims,
                    input_length=maxlen))

# we add a GlobalAveragePooling1D, which will average the embeddings
# of all words in the document
model.add(GlobalAveragePooling1D())

# We project onto a single unit output layer, and squash it with a sigmoid:
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
          validation_data=(x_test, y_test))
```
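
Because `validation_data=(x_test, y_test)` is passed to `model.fit`, validation loss and accuracy on the test split are printed after every epoch; the bi-gram configuration in the table above corresponds to setting `ngram_range = 2` near the top of the script. As an optional follow-up that is not part of the original example, the trained model can also be scored explicitly:

```python
# Optional follow-up (not in the original script): report final test metrics.
score, acc = model.evaluate(x_test, y_test, batch_size=batch_size)
print('Test loss:', score)
print('Test accuracy:', acc)
```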