import nltk
from nltk.stem.snowball import GermanStemmer
stemmer = GermanStemmer()
from nltk.corpus import stopwords
import numpy as np
import tensorflow as tf
import tflearn
import random
import os
import inspect
import json

def getPath(file):
    # build an absolute path relative to this script's directory
    path = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
    path = os.path.join(path, file).replace("\\", "/")
    return path

def getJsonPath():
    # path to the dialog definition file next to this script
    return getPath('chat.json')

# import the dialog design
with open(getJsonPath(), encoding='UTF-8') as json_data:
    dialogflow = json.load(json_data)
words = []
classes = []
documents = []
stop = stopwords.words('german')
ignore_words = ['?', '.', ','] + stop

# loop over every sentence in our dialogflow definition and its synonyms
for dialog in dialogflow['dialogflow']:
    for pattern in dialog['synonym']:
        # tokenize every word in the sentence
        w = nltk.word_tokenize(pattern)
        # add the tokens to our word list
        words.extend(w)
        # add them to the documents in our corpus
        documents.append((w, dialog['intent']))
        # add the intent to our class list
        if dialog['intent'] not in classes:
            classes.append(dialog['intent'])

# stem every word and remove duplicates
words = [stemmer.stem(w.lower()) for w in words if w not in ignore_words] + ['weit', 'and', 'nicht']
words = sorted(list(set(words)))
# sort our classes
classes = sorted(list(set(classes)))

print(len(documents), "Docs")
print(len(classes), "Classes", classes)
print(len(words), "Split words", words)
# build our training data
training = []
# create an empty array for our output
output_empty = [0] * len(classes)

# generate the training set: a bag of words for every sentence
for doc in documents:
    # initialize our bag of words
    bag = []
    # list of tokenized words for the synonym
    pattern_words = doc[0]
    # stem every word
    pattern_words = [stemmer.stem(word.lower()) for word in pattern_words]
    # build our bag-of-words array
    for w in words:
        bag.append(1 if w in pattern_words else 0)
    # output is '0' for every intent and '1' for the current intent
    output_row = list(output_empty)
    output_row[classes.index(doc[1])] = 1
    training.append([bag, output_row])

# shuffle our features and turn them into an np.array
# (dtype=object because bag and output_row have different lengths)
random.shuffle(training)
training = np.array(training, dtype=object)

# build the training lists
train_x = list(training[:, 0])
train_y = list(training[:, 1])
tf.compat.v1.reset_default_graph()

# build the neural network
net = tflearn.input_data(shape=[None, len(train_x[0])])
net = tflearn.fully_connected(net, 88)
net = tflearn.fully_connected(net, 88)
net = tflearn.fully_connected(net, len(train_y[0]), activation='softmax')
net = tflearn.regression(net)

# define the model and configure tensorboard
model = tflearn.DNN(net, tensorboard_dir=getPath('train_logs'))
# start training the model
model.fit(train_x, train_y, n_epoch=1000, batch_size=256, show_metric=True)
# save the trained model
model.save(getPath('model.tflearn'))
print("model created")
# process the user input to produce a bag of words
def frageBearbeitung(frage):
    # tokenize the question
    sentence_word = nltk.word_tokenize(frage, language='german')
    # build the stopword list
    stop = stopwords.words('german')
    ignore_words = ['?', '.', ','] + stop
    ###### spelling-error correction (currently disabled)
    sentence_words = []
    for word in sentence_word:
        # keep non-stopwords, plus a few words the intents rely on
        if word not in ignore_words or word == 'weiter' or word == 'andere' or word == 'nicht':
            #a=correction(word)
            sentence_words.append(word)
    # stem every word
    sentence_words = [stemmer.stem(word.lower()) for word in sentence_words]
    return sentence_words
# return a bag-of-words array: 0 or 1 for each word in the bag that occurs in the sentence
def bow(frage, words, show_details=False):
    sentence_words = frageBearbeitung(frage)
    bag = [0] * len(words)
    for s in sentence_words:
        for i, w in enumerate(words):
            if w == s:
                bag[i] = 1
                if show_details:
                    print("found in bag: %s" % w)
    return np.array(bag)
def lowercase(obj):
    """Recursively lowercase and stem the string values of a dictionary."""
    if isinstance(obj, dict):
        return {k: lowercase(v) for k, v in obj.items()}
    elif isinstance(obj, (list, set, tuple)):
        t = type(obj)
        return t(lowercase(o) for o in obj)
    elif isinstance(obj, str):
        if " " in obj:
            # stem every token of a multi-word string and re-join it
            tokens = [stemmer.stem(i.lower()) for i in nltk.word_tokenize(obj)]
            return ' '.join(tokens)
        else:
            return stemmer.stem(obj.lower())
    else:
        return obj
ERROR_THRESHOLD = 0

def klassifizieren(frage):
    # generate probabilities from the model
    p = bow(frage, words, show_details=False)
    results = model.predict(np.array([p]))[0]
    # filter out predictions below the threshold
    results = [[i, r] for i, r in enumerate(results) if r > ERROR_THRESHOLD]
    # sort by probability, strongest first
    results.sort(key=lambda x: x[1], reverse=True)
    return_list = []
    for r in results:
        return_list.append((classes[r[0]], r[1]))
    return return_list

print(klassifizieren('hallo'))
hilf_entiti = {}
leistung_entiti = {}
lebenslage_entiti = {}

# map every entity name to its list of values (lowercased and stemmed)
for entities in dialogflow['entities_hilfe']:
    hilf_entiti[entities['entitie']] = lowercase(entities['wert'])

for entities in dialogflow['entities_leistungen']:
    leistung_entiti[entities['entitie']] = lowercase(entities['wert'])

# the Lebenslage values are kept exactly as written in the dialog file
for entities in dialogflow['entities_lebenslage']:
    lebenslage_entiti[entities['entitie']] = entities['wert']
import pickle

#pickle.dump(model, open(getPath('model.keras'), "wb"))
#pickle.dump(model, getPath("katana-assistant-model.pkl", "wb"))
# save all of our data structures
pickle.dump({'words': words, 'classes': classes, 'train_x': train_x, 'train_y': train_y},
            open(getPath('trained_data'), "wb"))

with open(getPath('hilfe_data'), "wb") as filehandlehilfe:
    # store the data as a binary data stream
    pickle.dump(hilf_entiti, filehandlehilfe)
with open(getPath('leistung_data'), "wb") as filehandleleistung:
    # store the data as a binary data stream
    pickle.dump(leistung_entiti, filehandleleistung)
with open(getPath('lebenslage_data'), "wb") as filehandlelebenslage:
    # store the data as a binary data stream
    pickle.dump(lebenslage_entiti, filehandlelebenslage)
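
# Hedged sketch (an addition, not part of the original script): re-load two of
# the pickles written above to verify the round trip and to show how a separate
# inference script would read them back; the variable names below are
# illustrative only.
with open(getPath('trained_data'), "rb") as filehandletrain:
    reloaded = pickle.load(filehandletrain)
print(len(reloaded['words']), "words and", len(reloaded['classes']), "classes reloaded")
with open(getPath('hilfe_data'), "rb") as filehandlehilfe:
    print(len(pickle.load(filehandlehilfe)), "Hilfe entities reloaded")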