Data types to be worked with:
Fundamental algorithms:
Possible applications:
Applications we will build:
Text vectorization consists of tokenizing the text and associating a vector with each token.
These token vectors are what we feed into deep-learning models.
The two main methods for vectorizing words are one-hot encoding and word embeddings.
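To make the contrast concrete, the sketch below (sizes chosen arbitrarily and values generated at random, purely for illustration) compares the two representations of a single token: a sparse one-hot vector as long as the vocabulary versus a dense, low-dimensional embedding vector.

import numpy as np

vocabulary_size = 10   # assume a tiny vocabulary of 10 tokens
token_index = 3        # index of some token in that vocabulary

# One-hot: sparse, as long as the vocabulary, with a single 1
one_hot_vector = np.zeros(vocabulary_size)
one_hot_vector[token_index] = 1.

# Embedding: dense and low-dimensional; here the values are random,
# in a real model they are learned during training
embedding_dim = 4
embedding_matrix = np.random.random((vocabulary_size, embedding_dim))
embedding_vector = embedding_matrix[token_index]

print(one_hot_vector)    # [0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
print(embedding_vector)  # four random floats, e.g. [0.42 0.17 0.88 0.05]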
import numpy as np

samples = ['The cat sat on the mat.', 'The dog ate my homework.']

# Build a vocabulary: assign a unique index to each word (index 0 is left unused)
token_index = {}
for sample in samples:
    for word in sample.split():
        if word not in token_index:
            token_index[word] = len(token_index) + 1

# One-hot encode each sample, keeping at most max_length words per sample
max_length = 10
results = np.zeros(shape=(len(samples),
                          max_length,
                          max(token_index.values()) + 1))
for i, sample in enumerate(samples):
    for j, word in list(enumerate(sample.split()))[:max_length]:
        index = token_index.get(word)
        results[i, j, index] = 1

results
array([[[0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],

       [[0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]]])
import string

samples = ['The cat sat on the mat.', 'The dog ate my homework.']

# Character-level one-hot encoding over all printable ASCII characters
characters = string.printable
# Map each character to a unique index (index 0 is left unused)
token_index = dict(zip(characters, range(1, len(characters) + 1)))

max_length = 50
results = np.zeros((len(samples), max_length, max(token_index.values()) + 1))
for i, sample in enumerate(samples):
    for j, character in enumerate(sample):
        index = token_index.get(character)
        results[i, j, index] = 1

results
array([[[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]],

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]])
Although one-hot encoding can be done "by hand", it is best done with Keras, which takes care of several important issues that come up when working with text.
from keras.preprocessing.text import Tokenizer

samples = ['The cat sat on the mat.', 'The dog ate my homework.']

# Keep only the 1,000 most frequent words
tokenizer = Tokenizer(num_words=1000)
tokenizer.fit_on_texts(samples)

# Turn the texts into lists of integer word indices
sequences = tokenizer.texts_to_sequences(samples)
# Or directly into binary one-hot matrices
one_hot_results = tokenizer.texts_to_matrix(samples, mode='binary')

word_index = tokenizer.word_index
print('Found %d unique tokens.' % len(word_index))
Found 9 unique tokens.
word_index
{'ate': 7, 'cat': 2, 'dog': 6, 'homework': 9, 'mat': 5, 'my': 8, 'on': 4, 'sat': 3, 'the': 1}
one_hot_results
array([[0., 1., 1., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.]])
sequences
[[1, 2, 3, 4, 1, 5], [1, 6, 7, 8, 9]]
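As a quick sanity check (not part of the original listing), the sequences above can be decoded back into words by inverting the word_index dictionary built by the Tokenizer; note that the Tokenizer lowercases the text and strips punctuation.

# Invert the word -> index mapping and decode the integer sequences
reverse_word_index = {index: word for word, index in word_index.items()}
decoded = [' '.join(reverse_word_index[i] for i in seq) for seq in sequences]
print(decoded)  # ['the cat sat on the mat', 'the dog ate my homework']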
When the number of distinct tokens is large enough, it is worth using the one-hot hashing trick.
A hash function is an algorithm that maps data of arbitrary length to data of fixed length.
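For example, here is a minimal illustration (not part of the original listing) using the standard library's hashlib: a short string and a much longer one are both mapped to digests of the same fixed length.

import hashlib

short_text = 'cat'
long_text = 'The dog ate my homework.' * 100

# Regardless of input length, SHA-256 produces a 64-character hex digest
print(hashlib.sha256(short_text.encode()).hexdigest())
print(hashlib.sha256(long_text.encode()).hexdigest())
print(len(hashlib.sha256(short_text.encode()).hexdigest()),
      len(hashlib.sha256(long_text.encode()).hexdigest()))  # 64 64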
samples = ['The cat sat on the mat.', 'The dog ate my homework.']

# Instead of storing an explicit index for every word, hash each word into
# a fixed-size vector of dimensionality 1000 (collisions are possible, but
# unlikely while the vocabulary stays much smaller than the dimensionality)
dimensionality = 1000
max_length = 10

results = np.zeros((len(samples), max_length, dimensionality))
for i, sample in enumerate(samples):
    for j, word in list(enumerate(sample.split()))[:max_length]:
        index = abs(hash(word)) % dimensionality
        results[i, j, index] = 1

results
array([[[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]],

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]])
Interesting material on word2vec: The Illustrated Word2vec
Embeddings with TensorFlow: Embedding Projector
Keras has a dedicated layer for this, which makes life easier: the Embedding layer
from keras.layers import Embedding

# Up to 1,000 possible token indices, each mapped to a 4-dimensional dense vector
embedding_layer = Embedding(1000, 4)
The Embedding layer is best understood as a dictionary that maps integer indices to dense vectors.
Word index -> Embedding layer -> Corresponding word vector
It takes as input a 2D tensor of shape (samples, sequence_length).
The Embedding layer can handle sequences of varying lengths, but all sequences in a given batch must have the same length:
- shorter sequences are padded with zeros and longer sequences are truncated.
The layer returns a 3D tensor of shape (samples, sequence_length, embedding_dimensionality), which can then be processed by an RNN layer or a 1D convolutional layer, as sketched below.
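A minimal sketch of this input/output contract (the integer sequences and sizes below are made up for illustration, using the same Keras API as the rest of these notes):

from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding

# Sequences of different lengths are padded/truncated to a common length
raw_sequences = [[1, 5, 7], [2, 4, 9, 3, 8, 6]]
padded = pad_sequences(raw_sequences, maxlen=5)
print(padded.shape)                  # 2D input tensor: (2, 5)

# The Embedding layer turns each integer index into an 8-dimensional vector
model = Sequential()
model.add(Embedding(input_dim=1000, output_dim=8, input_length=5))
print(model.predict(padded).shape)   # 3D output tensor: (2, 5, 8)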
from keras.datasets import imdb
from keras import preprocessing

max_features = 10000   # number of words to consider as features
maxlen = 20            # cut reviews after this many words

# Load the reviews as lists of integer word indices
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)

# Turn the lists into 2D integer tensors of shape (samples, maxlen)
x_train = preprocessing.sequence.pad_sequences(x_train, maxlen=maxlen)
x_test = preprocessing.sequence.pad_sequences(x_test, maxlen=maxlen)
x_train
array([[  65,   16,   38, ...,   19,  178,   32],
       [  23,    4, 1690, ...,   16,  145,   95],
       [1352,   13,  191, ...,    7,  129,  113],
       ...,
       [  11, 1818, 7561, ...,    4, 3586,    2],
       [  92,  401,  728, ...,   12,    9,   23],
       [ 764,   40,    4, ...,  204,  131,    9]], dtype=int32)
y_train
array([1, 0, 0, ..., 0, 1, 0])
from keras.models import Sequential
from keras.layers import Flatten, Dense, Embedding

model = Sequential()
# 8-dimensional embeddings for the 10,000 most frequent words
model.add(Embedding(10000, 8, input_length=maxlen))
# Flatten the (maxlen, 8) embeddings into a single vector per review
model.add(Flatten())
# Single sigmoid unit for binary (positive/negative) classification
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])
model.summary()

history = model.fit(x_train, y_train,
                    epochs=10,
                    batch_size=32,
                    validation_split=0.2)
Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #
=================================================================
embedding_4 (Embedding)      (None, 20, 8)             80000
_________________________________________________________________
flatten_2 (Flatten)          (None, 160)               0
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 161
=================================================================
Total params: 80,161
Trainable params: 80,161
Non-trainable params: 0
_________________________________________________________________
Train on 20000 samples, validate on 5000 samples
Epoch 1/10
20000/20000 [==============================] - 2s 79us/step - loss: 0.6582 - acc: 0.6409 - val_loss: 0.5989 - val_acc: 0.7000
Epoch 2/10
20000/20000 [==============================] - 1s 70us/step - loss: 0.5261 - acc: 0.7546 - val_loss: 0.5183 - val_acc: 0.7324
Epoch 3/10
20000/20000 [==============================] - 1s 72us/step - loss: 0.4571 - acc: 0.7893 - val_loss: 0.4971 - val_acc: 0.7480
Epoch 4/10
20000/20000 [==============================] - 1s 71us/step - loss: 0.4226 - acc: 0.8079 - val_loss: 0.4943 - val_acc: 0.7530
Epoch 5/10
20000/20000 [==============================] - 1s 70us/step - loss: 0.3994 - acc: 0.8203 - val_loss: 0.4929 - val_acc: 0.7542
Epoch 6/10
20000/20000 [==============================] - 1s 71us/step - loss: 0.3798 - acc: 0.8315 - val_loss: 0.4945 - val_acc: 0.7552
Epoch 7/10
20000/20000 [==============================] - 1s 70us/step - loss: 0.3619 - acc: 0.8406 - val_loss: 0.4992 - val_acc: 0.7556
Epoch 8/10
20000/20000 [==============================] - 1s 72us/step - loss: 0.3452 - acc: 0.8516 - val_loss: 0.5027 - val_acc: 0.7600
Epoch 9/10
20000/20000 [==============================] - 1s 71us/step - loss: 0.3290 - acc: 0.8614 - val_loss: 0.5077 - val_acc: 0.7564
Epoch 10/10
20000/20000 [==============================] - 1s 71us/step - loss: 0.3130 - acc: 0.8687 - val_loss: 0.5150 - val_acc: 0.7552
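To see where the model starts to overfit, a short plotting sketch (assuming matplotlib is available; the 'acc'/'val_acc' keys match the metric name passed to compile above):

import matplotlib.pyplot as plt

acc = history.history['acc']
val_acc = history.history['val_acc']
loss = history.history['loss']
val_loss = history.history['val_loss']
epochs = range(1, len(acc) + 1)

# Training vs. validation accuracy
plt.plot(epochs, acc, 'bo', label='Training acc')
plt.plot(epochs, val_acc, 'b', label='Validation acc')
plt.title('Training and validation accuracy')
plt.legend()

# Training vs. validation loss
plt.figure()
plt.plot(epochs, loss, 'bo', label='Training loss')
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.legend()

plt.show()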
We obtained a validation accuracy of ~75%.
This model does not take into account the relationships between words or the structure of the sequence: each word of the review is treated in isolation.
08/11
from google.colab import drive
drive.mount('/content/drive')
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
import os

imdb_dir = r'/content/drive/My Drive/Deep_Learning/aclImdb'
train_dir = os.path.join(imdb_dir, 'train')

# Read the raw reviews from the neg/ and pos/ subdirectories and build
# the corresponding list of labels (0 = negative, 1 = positive)
labels = []
texts = []
for label_type in ['neg', 'pos']:
    dir_name = os.path.join(train_dir, label_type)
    for fname in os.listdir(dir_name):
        if fname[-4:] == '.txt':
            f = open(os.path.join(dir_name, fname))
            texts.append(f.read())
            f.close()
            if label_type == 'neg':
                labels.append(0)
            else:
                labels.append(1)
---------------------------------------------------------------------------
FileNotFoundError                         Traceback (most recent call last)
<ipython-input-20-42c7d6a25601> in <module>()
      1 for label_type in ['neg', 'pos']:
      2     dir_name = os.path.join(train_dir, label_type)
----> 3     for fname in os.listdir(dir_name):
      4         if fname[-4:] == '.txt':
      5             f = open(os.path.join(dir_name, fname))

FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/My Drive/Deep_Learning/aclImdb/train/neg'
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import numpy as np

maxlen = 100                 # cut reviews after 100 words
training_samples = 200       # train on only 200 samples
validation_samples = 10000   # validate on 10,000 samples
max_words = 10000            # consider only the 10,000 most frequent words

tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

data = pad_sequences(sequences, maxlen=maxlen)
labels = np.asarray(labels)
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

# Shuffle the data before splitting into training and validation sets
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]

x_train = data[:training_samples]
y_train = labels[:training_samples]
x_val = data[training_samples: training_samples + validation_samples]
y_val = labels[training_samples: training_samples + validation_samples]
Found 0 unique tokens.
Shape of data tensor: (0, 100)
Shape of label tensor: (0,)
import pandas as pd
df = pd.read_csv('/content/drive/My Drive/Deep_Learning/aclImdb/teste.csv')
df.head()
   a   b   c
0  1   2   3
1  4   5   6
2  1   2   3
3  2   3   5
4  7  11  13