feat(nlp): improve exploitable phrases

Signed-off-by: Sphericalkat <me@kat.bio>
This commit is contained in:
Amogh Lele 2024-05-25 17:57:15 +05:30
parent dd1e11bd45
commit 8e8bed9813
Signed by: sphericalkat
GPG Key ID: 1C022B9CED2425B4
2 changed files with 69 additions and 20 deletions

20
main.py
View File

@ -1,4 +1,5 @@
import logging
import random
from telegram import Update
from telegram.ext import (
@ -9,7 +10,7 @@ from telegram.ext import (
filters,
)
from nlp import is_noun_follows_verb
import nlp
from settings import Settings
# Load environment variables
@ -29,23 +30,18 @@ async def start(update: Update, context: ContextTypes.DEFAULT_TYPE):
# Handler for incoming messages: reads the message text and replies to it.
# NOTE(review): this is a commit diff view. Lines removed and lines added by the
# commit appear interleaved below (e.g. msg_content is assigned twice, and two
# different reply calls are present); the +/- markers and the indentation are
# not preserved, so read this as a diff rather than as runnable code.
async def message_handler(update: Update, context: ContextTypes.DEFAULT_TYPE):
# Get the message text content
msg_content = update.effective_message.text
msg = update.effective_message
msg_content = msg.text
# Ignore messages without text
if not msg_content:
return
# Check that the message doesn't have more than 5 words
if len(msg_content.split()) > 5:
return
# generate exploitable phrases
exploitable_phrases = nlp.find_exploitable_phrases(msg_content)
# Check if a noun immediately follows a verb
is_follows_verb, verb = is_noun_follows_verb(msg_content)
if is_follows_verb:
await update.effective_message.reply_text(f"{verb} deez")
# NOTE(review): random.choice raises IndexError on an empty sequence, and
# find_exploitable_phrases returns an empty list when no token is tagged VERB.
# Guard with `if not exploitable_phrases: return` before choosing a phrase.
phrase = random.choice(exploitable_phrases)
return
await msg.reply_text(phrase)
if __name__ == "__main__":

65
nlp.py
View File

@ -7,6 +7,7 @@ nlp = spacy.load("en_core_web_sm")
def is_noun_follows_verb(text: str) -> bool:
doc = nlp(text)
for i in range(len(doc) - 1):
print(doc[i].pos_, doc[i].text, doc[i + 1].pos_, doc[i + 1].text)
# Check if the current token is a verb and the next token is a noun
if doc[i].pos_ == "VERB" and doc[i + 1].pos_ in ["NOUN", "PROPN", "PRON"]:
return True, doc[i].lemma_
@ -14,10 +15,62 @@ def is_noun_follows_verb(text: str) -> bool:
return False, None
def find_exploitable_phrases(sentence: str) -> "list[str]":
    """Build one short phrase for every verb found in *sentence*.

    Each phrase is assembled from three parts, joined by single spaces:

    1. the verb's ``advmod`` / ``neg`` children, in sentence order;
    2. the verb's lemma;
    3. every ``prep`` child of the verb, each immediately followed by its
       own ``pcomp`` children.

    Example (from the module's sample sentences): in "I am testing for bugs."
    the preposition "for" depends on "testing", so it is kept after the verb.

    Args:
        sentence: Raw text to run through the module-level spaCy pipeline.

    Returns:
        The phrases in the order their verbs occur in the sentence. The list
        is empty when no token is tagged ``VERB``.
    """
    doc = nlp(sentence)

    exploitable_phrases = []
    for token in doc:
        if token.pos_ != "VERB":
            continue

        modifiers = []  # adverbs / negations, placed before the verb
        tail = []       # prepositions (and their complements), placed after it

        # Walk the verb's dependents by their position in the sentence so
        # both parts read in the same order as the original text.
        for child in sorted(token.children, key=lambda t: t.i):
            if child.dep_ == "prep":
                tail.append(child.text)
                # Keep a prepositional complement right after its preposition.
                tail.extend(
                    sub.text for sub in child.children if sub.dep_ == "pcomp"
                )
            elif child.dep_ in {"advmod", "neg"}:
                # Appending (instead of inserting at index 0 one by one)
                # preserves sentence order when there are several modifiers,
                # e.g. "not really like" instead of "really not like".
                modifiers.append(child.text)

        exploitable_phrases.append(" ".join([*modifiers, token.lemma_, *tail]))

    return exploitable_phrases
# Script entry point used as a manual smoke test; it only prints results.
# NOTE(review): commit diff view. The first part exercises
# is_noun_follows_verb on a single sentence, the second part runs
# find_exploitable_phrases over a list of sample sentences. The +/- markers and
# the indentation are not preserved here, so which of these lines the commit
# removed is not visible from this text alone.
if __name__ == "__main__":
text = "I was eating pizza"
is_follows_verb, verb = is_noun_follows_verb(text)
if is_follows_verb:
print(f"{verb} deez")
else:
print("No noun follows a verb in the text")
# Test the function
sentences = [
"I am testing for bugs.",
"She was speaking at a conference.",
"He is looking into the issue.",
"They are working on the project.",
"Apple is looking at buying U.K. startup for $1 billion",
]
for sentence in sentences:
phrases = find_exploitable_phrases(sentence)
for phrase in phrases:
# Prints the full input sentence (despite the "phrase" wording of the label),
# once per phrase produced for that sentence.
print(f"Original phrase: {sentence}")
print(f"Exploitable phrase: {phrase} deez")
print()