User Tools

Site Tools


it:ai_tensorflow_nlpm

AI - deep learning on text data using TensorFlow and NLPM

Introduction

  • this method uses a Neural Probabilistic Language Model (a concept developed in 2003), which allows fairly minimal code to achieve reasonable results

Code example using a pre-trained Neural Probabilistic Language Model

Set up and import the dataset from a CSV file

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
#from sklearn.model_selection import train_test_split
#from sklearn.preprocessing import StandardScaler

import tensorflow as tf
import tensorflow_hub as hub

# Load only the columns the tutorial needs from the wine-reviews CSV.
cols = ['country', 'description', 'points', 'price', 'variety', 'winery']
df = pd.read_csv("data/wine-reviews.csv", usecols=cols)

df.head()  # view some of the imported data

# Shuffle the rows, then split into train (80%), validation (10%) and test (10%):
shuffled = df.sample(frac=1)
train, val, test = np.split(shuffled, [int(0.8 * len(df)), int(0.9 * len(df))])

# check how many rows ended up in each dataset:
len(train), len(val), len(test)

# A utility method to create a tf.data dataset from a Pandas Dataframe
def df_to_dataset(dataframe, shuffle=True, batch_size=1024):
  """Convert a DataFrame into a batched, prefetched tf.data.Dataset.

  Uses the 'description' column as the text feature and the 'label'
  column as the target.  NOTE(review): the CSV load shown earlier does
  not create a 'label' column — presumably it is derived upstream
  (e.g. from 'points'); confirm before running.

  Args:
    dataframe: source pandas DataFrame; it is NOT modified.
    shuffle: whether to shuffle the examples (default True).
    batch_size: number of examples per batch (default 1024).

  Returns:
    A tf.data.Dataset yielding (description, label) batches.
  """
  df = dataframe.copy()
  # BUG FIX: pop the label off the COPY. The original popped it from
  # `dataframe`, silently mutating the caller's DataFrame as a side effect.
  labels = df.pop('label')
  features = df["description"]
  ds = tf.data.Dataset.from_tensor_slices((features, labels))
  if shuffle:
    # Buffer covering the whole split gives a full shuffle.
    ds = ds.shuffle(buffer_size=len(dataframe))
  ds = ds.batch(batch_size)
  # Overlap preprocessing with training for throughput.
  ds = ds.prefetch(tf.data.AUTOTUNE)
  return ds
  
# Convert each split into a tf.data pipeline of (text, label) batches:
train_data, valid_data, test_data = (
    df_to_dataset(split) for split in (train, val, test)
)

# Peek at the first training batch to check the tensor content:
list(train_data)[0]

embed data with pre-trained text layer

  • TensorFlow Hub allows use of pre-trained layers such as a Neural Probabilistic Language Model — in this case, nnlm-en-dim50, which produces 50-dimensional embeddings.
    • this is a token-based text embedding trained on the English Google News 7B corpus and avoids the need to tokenize
    • set trainable to True if the training dataset is large enough to fine-tune the parameters of the NNLM
# TF-Hub handle for the pretrained NNLM text-embedding model (50-dim vectors).
embedding = "https://tfhub.dev/google/nnlm-en-dim50/2"
# Wrap it as a Keras layer; trainable=True allows its weights to be fine-tuned.
hub_layer = hub.KerasLayer(embedding,dtype=tf.string, trainable=True) 

# Deprecation warning observed when this was run (informational only):
'''WARNING:tensorflow:Please fix your imports. Module tensorflow.python.training.tracking.data_structures has been moved to tensorflow.python.trackable.data_structures.
 The old module will be deleted in version 2.11.'''

#display embedded data:
hub_layer(list(train_data)[0][0]) #ie has converted the text to an array of numbers using nnlm-en-dim50

create the model

# Build the classifier: pretrained NNLM embedding, two 16-unit ReLU
# hidden layers each followed by dropout, and a single sigmoid output.
model = tf.keras.Sequential([
    hub_layer,
    tf.keras.layers.Dense(16, activation="relu"),    # 16 neurons
    tf.keras.layers.Dropout(0.4),                    # dropout to reduce over-fitting
    tf.keras.layers.Dense(16, activation="relu"),
    tf.keras.layers.Dropout(0.4),
    tf.keras.layers.Dense(1, activation="sigmoid"),  # binary output probability
])

run the model and evaluate initial random parameters

# Compile for binary classification: Adam optimiser, binary cross-entropy
# loss, reporting accuracy.
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
loss_fn = tf.keras.losses.BinaryCrossentropy()
model.compile(optimizer=optimizer, loss=loss_fn, metrics=['accuracy'])

# Baseline evaluation with the still-random initial weights:
model.evaluate(valid_data)

train the model

# Train for a single epoch, tracking performance on the validation split:
history = model.fit(train_data, validation_data=valid_data, epochs=1)

optionally, plot the output

#plot to demonstrate failure of model to improve when used on validation - issue of over-fitting
#solution is to add layers of dropout
#this was killing my kernel:!!!

# Plot training vs validation accuracy per epoch.
# FIX: the original lines used backslash-escaped quotes (\'accuracy\'),
# which is a SyntaxError outside a string literal.
plt.plot(history.history['accuracy'], label="Training acc")
plt.plot(history.history['val_accuracy'], label="Validation acc")
plt.title("Accuracy of model")
plt.ylabel("Accuracy")
plt.xlabel("Epoch")
plt.legend()
plt.show()

evaluate trained model on test data

# Final evaluation of the trained model on the held-out test split.
model.evaluate(test_data)
#this should give loss: 0.4943 - accuracy: 0.7801
it/ai_tensorflow_nlpm.txt · Last modified: 2023/08/17 05:17 by gary1

Donate Powered by PHP Valid HTML5 Valid CSS Driven by DokuWiki