import nltk

from nltk.stem.snowball import GermanStemmer

stemmer = GermanStemmer()

from nltk.corpus import stopwords

import numpy as np
import tensorflow as tf
import tflearn
import random

import os
import inspect

def getPath(file):
    path = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
    path = os.path.join(path, file).replace("\\", "/")
    return path


import json


def getJsonPath():
    path = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
    path = os.path.join(path, 'chat.json').replace("\\", "/")
    return path

# import the dialog design
with open(getJsonPath(), encoding='UTF-8') as json_data:
    dialogflow = json.load(json_data)

words = []
classes = []
documents = []
stop = stopwords.words('german')
ignore_words = ['?', '.', ','] + stop

# loop over every sentence in our dialogflow and its synonyms
for dialog in dialogflow['dialogflow']:
    for pattern in dialog['synonym']:
        # tokenize every word in the sentence
        w = nltk.word_tokenize(pattern)
        # add the tokens to our word list
        words.extend(w)
        # add the tokens to the documents in our corpus
        documents.append((w, dialog['intent']))
        # add the intent to our class list
        if dialog['intent'] not in classes:
            classes.append(dialog['intent'])

# stem every word and remove duplicates
words = [stemmer.stem(w.lower()) for w in words if w not in ignore_words] + ['weit', 'and', 'nicht']
words = sorted(list(set(words)))

# sort our classes
classes = sorted(list(set(classes)))

print(len(documents), "Docs")
print(len(classes), "Classes", classes)
print(len(words), "Split words", words)
# build our training data
training = []
output = []
# create an empty array for our output
output_empty = [0] * len(classes)

# generate the training set and a bag of words for every sentence
for doc in documents:
    # initialize our bag of words
    bag = []
    # list of tokenized words for the synonym
    pattern_words = doc[0]
    # stem every word
    pattern_words = [stemmer.stem(word.lower()) for word in pattern_words]
    # build our bag-of-words array
    for w in words:
        bag.append(1 if w in pattern_words else 0)
    # output is '0' for every intent and '1' for the current intent
    output_row = list(output_empty)
    output_row[classes.index(doc[1])] = 1

    training.append([bag, output_row])
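# Illustration only (hypothetical sizes): with a 5-word vocabulary and 3 intents,
# a single training example would look like
#   bag        = [0, 1, 0, 0, 1]   # which vocabulary words occur in the pattern
#   output_row = [0, 0, 1]         # one-hot vector marking the pattern's intent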

# shuffle our features and turn them into an np.array
random.shuffle(training)
# dtype=object keeps the ragged [bag, output_row] pairs intact (required on newer NumPy versions)
training = np.array(training, dtype=object)

# build the training lists
train_x = list(training[:, 0])
train_y = list(training[:, 1])

# tf.reset_default_graph()
tf.compat.v1.reset_default_graph()
# build the neural network
net = tflearn.input_data(shape=[None, len(train_x[0])])
net = tflearn.fully_connected(net, 88)
net = tflearn.fully_connected(net, 88)
net = tflearn.fully_connected(net, len(train_y[0]), activation='softmax')
net = tflearn.regression(net)

# define the model and configure tensorboard
model = tflearn.DNN(net, tensorboard_dir=getPath('train_logs'))
# start training the model
model.fit(train_x, train_y, n_epoch=1000, batch_size=256, show_metric=True)
# save the trained model
model.save(getPath('model.tflearn'))

print("model created")

# process the user input to build a bag of words
def frageBearbeitung(frage):
    # tokenize the user input
    sentence_word = nltk.word_tokenize(frage, language='german')
    # generate the stop words
    stop = stopwords.words('german')
    ignore_words = ['?', '.', ','] + stop
    # correction of spelling errors (currently disabled)
    sentence_words = []
    for word in sentence_word:
        if word not in ignore_words or word == 'weiter' or word == 'andere' or word == 'nicht':
            # a = correction(word)
            sentence_words.append(word)
    # stem every word
    sentence_words = [stemmer.stem(word.lower()) for word in sentence_words]
    return sentence_words

# return a bag-of-words array: 0 or 1 for every word in the 'bag' that occurs in the sentence
def bow(frage, words, show_details=False):
    sentence_words = frageBearbeitung(frage)
    bag = [0] * len(words)
    for s in sentence_words:
        for i, w in enumerate(words):
            if w == s:
                bag[i] = 1
                if show_details:
                    print("found in bag: %s" % w)

    return np.array(bag)
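# Usage illustration (hypothetical input, output depends on the trained vocabulary):
#   bow('Ich brauche Hilfe', words)  ->  array([0, 0, 1, ..., 0])
# The vector has one entry per vocabulary word, set to 1 where a stemmed
# token of the question matches that word.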


def lowercase(obj):
    """ Make dictionary values lowercase and stem them """
    if isinstance(obj, dict):
        return {k: lowercase(v) for k, v in obj.items()}
    elif isinstance(obj, (list, set, tuple)):
        t = type(obj)
        return t(lowercase(o) for o in obj)
    elif isinstance(obj, str):
        if " " in obj:
            # stem every token of a multi-word value and rejoin it
            stemmed = []
            for token in nltk.word_tokenize(obj):
                stemmed.append(stemmer.stem(token.lower()))
            return ' '.join(stemmed)
        else:
            return stemmer.stem(obj.lower())
    else:
        return obj
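# Illustration (hypothetical entity value): lowercase({'amt': 'Weitere Hilfe'})
# would return something like {'amt': 'weit hilf'}; the exact strings depend on
# the GermanStemmer output. Keys are left unchanged, only values are stemmed.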


# 0 means no predictions are filtered out
ERROR_THRESHOLD = 0


def klassifizieren(frage):
    # generate probabilities from the model
    p = bow(frage, words, show_details=False)
    results = model.predict(np.array([p]))[0]

    # filter out predictions below the threshold
    results = [[i, r] for i, r in enumerate(results) if r > ERROR_THRESHOLD]
    # sort by probability, strongest first
    results.sort(key=lambda x: x[1], reverse=True)
    return_list = []
    for r in results:
        return_list.append((classes[r[0]], r[1]))
    return return_list


print(klassifizieren('hallo'))
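# The call above prints a list of (intent, probability) pairs sorted by confidence,
# e.g. something like [('begruessung', 0.93), ('abschied', 0.04), ...]; the intent
# names here are hypothetical and depend on chat.json.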

hilf_entiti = {}
leistung_entiti = {}
lebenslage_entiti = {}


for entities in dialogflow['entities_hilfe']:
    for wert in entities['wert']:
        # tokenize every word in the value
        w = nltk.word_tokenize(wert)
        # map the entity name to its lowercased, stemmed values
        try:
            hilf_entiti[entities['entitie']] = lowercase(entities['wert'])
        except KeyError:
            hilf_entiti[entities['entitie']] = lowercase(entities['wert'])


for entities in dialogflow['entities_leistungen']:
    for wert in entities['wert']:
        # tokenize every word in the value
        w = nltk.word_tokenize(wert)
        # map the entity name to its lowercased, stemmed values
        try:
            leistung_entiti[entities['entitie']] = lowercase(entities['wert'])
        except KeyError:
            leistung_entiti[entities['entitie']] = lowercase(entities['wert'])


for entities in dialogflow['entities_lebenslage']:
    for wert in entities['wert']:
        # tokenize every word in the value
        w = nltk.word_tokenize(wert)
        # map the entity name to its values (kept unstemmed here)
        try:
            lebenslage_entiti[entities['entitie']] = entities['wert']
        except KeyError:
            lebenslage_entiti[entities['entitie']] = entities['wert']


import pickle

# pickle.dump(model, open(getPath('model.keras'), "wb"))
# pickle.dump(model, getPath("katana-assistant-model.pkl", "wb"))

# save all of our data structures
pickle.dump({'words': words, 'classes': classes, 'train_x': train_x, 'train_y': train_y},
            open(getPath('trained_data'), "wb"))


with open(getPath('hilfe_data'), "wb") as filehandlehilfe:
    # store the data as a binary data stream
    pickle.dump(hilf_entiti, filehandlehilfe)

with open(getPath('leistung_data'), "wb") as filehandleleistung:
    # store the data as a binary data stream
    pickle.dump(leistung_entiti, filehandleleistung)

with open(getPath('lebenslage_data'), "wb") as filehandlelebenslage:
    # store the data as a binary data stream
    pickle.dump(lebenslage_entiti, filehandlelebenslage)
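# Sketch (not part of the original script): a separate inference script could restore
# everything pickled above, assuming the same file layout and an equivalent getPath helper:
#
#   import pickle
#   data = pickle.load(open(getPath('trained_data'), "rb"))
#   words, classes = data['words'], data['classes']
#   with open(getPath('hilfe_data'), "rb") as f:
#       hilf_entiti = pickle.load(f)
#   # rebuild the identical tflearn network, then:
#   # model.load(getPath('model.tflearn'))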