feat(nlp): improve exploitable phrases

Signed-off-by: Sphericalkat <me@kat.bio>
2024-05-25 17:57:15 +05:30 · 2024-05-25 17:57:15 +05:30 · 8e8bed9813
commit 8e8bed9813
parent dd1e11bd45
2 changed files with 69 additions and 20 deletions
--- a/main.py
+++ b/main.py
@ -1,4 +1,5 @@
 import logging
+import random

 from telegram import Update
 from telegram.ext import (
@ -9,7 +10,7 @@ from telegram.ext import (
    filters,
 )

-from nlp import is_noun_follows_verb
+import nlp
 from settings import Settings

 # Load environment variables
@ -29,23 +30,18 @@ async def start(update: Update, context: ContextTypes.DEFAULT_TYPE):


 async def message_handler(update: Update, context: ContextTypes.DEFAULT_TYPE):
-    # Get the message text content
-    msg_content = update.effective_message.text
+    """Reply to a text message with a randomly chosen "<phrase> deez" line.
+
+    Messages without text, and messages in which no exploitable phrase is
+    found, are ignored.
+    """
+    msg = update.effective_message
+    msg_content = msg.text

-    # Ignore messages without text
     if not msg_content:
         return

-    # Check that the message doesn't have more than 5 words
-    if len(msg_content.split()) > 5:
-        return
+    # generate exploitable phrases
+    exploitable_phrases = nlp.find_exploitable_phrases(msg_content)

-    # Check if a noun immediately follows a verb
-    is_follows_verb, verb = is_noun_follows_verb(msg_content)
-    if is_follows_verb:
-        await update.effective_message.reply_text(f"{verb} deez")
+    # random.choice raises IndexError on an empty list, which happens
+    # whenever the message contains no verb
+    if not exploitable_phrases:
+        return

-    return
+    phrase = random.choice(exploitable_phrases)
+    await msg.reply_text(f"{phrase} deez")


 if __name__ == "__main__":
--- a/nlp.py
+++ b/nlp.py
@ -7,6 +7,7 @@ nlp = spacy.load("en_core_web_sm")
 def is_noun_follows_verb(text: str) -> bool:
    doc = nlp(text)
    for i in range(len(doc) - 1):
+        print(doc[i].pos_, doc[i].text, doc[i + 1].pos_, doc[i + 1].text)
        # Check if the current token is a verb and the next token is a noun
        if doc[i].pos_ == "VERB" and doc[i + 1].pos_ in ["NOUN", "PROPN", "PRON"]:
            return True, doc[i].lemma_
@ -14,10 +15,62 @@ def is_noun_follows_verb(text: str) -> bool:
    return False, None


+def find_exploitable_phrases(sentence):
+    """Build one short phrase for every verb found in *sentence*.
+
+    Each phrase consists of the verb's lemma, preceded by the adverbial and
+    negation modifiers attached to that verb and followed by the
+    prepositions attached to it (each with its prepositional complement,
+    if any). Other dependents of the verb are not included.
+
+    Args:
+        sentence: Text to parse with the module-level spaCy pipeline ``nlp``.
+
+    Returns:
+        A list of phrase strings, one per verb in document order. The list
+        is empty when no token is tagged as a verb.
+    """
+    # Parse the sentence using spaCy
+    doc = nlp(sentence)
+    exploitable_phrases = []
+
+    for token in doc:
+        if token.pos_ != "VERB":
+            continue
+
+        # Words that end up before / after the verb lemma in the phrase
+        leading = []
+        trailing = []
+
+        # Collect adverbs, negations and prepositions that are syntactically
+        # dependent on the verb
+        # eg: I am testing for bugs. -> "for" is dependent on "testing"
+        dependents = [
+            child
+            for child in token.children
+            if child.dep_ in {"advmod", "neg", "prep"}
+        ]
+
+        # Sort by position in the sentence so the phrase sounds natural
+        dependents.sort(key=lambda t: t.i)
+
+        for dependent in dependents:
+            if dependent.dep_ == "prep":
+                # Prepositions follow the verb, together with any
+                # prepositional complement hanging off them
+                trailing.append(dependent.text)
+                trailing.extend(
+                    sub.text
+                    for sub in dependent.children
+                    if sub.dep_ == "pcomp"
+                )
+            else:
+                # Appending (instead of inserting at index 0) keeps several
+                # modifiers in sentence order rather than reversing them
+                leading.append(dependent.text)
+
+        exploitable_phrases.append(" ".join([*leading, token.lemma_, *trailing]))
+
+    return exploitable_phrases
+
+
 if __name__ == "__main__":
-    text = "I was eating pizza"
-    is_follows_verb, verb = is_noun_follows_verb(text)
-    if is_follows_verb:
-        print(f"{verb} deez")
-    else:
-        print("No noun follows a verb in the text")
+    # Manual smoke test: print every phrase extracted from a few samples
+    samples = (
+        "I am testing for bugs.",
+        "She was speaking at a conference.",
+        "He is looking into the issue.",
+        "They are working on the project.",
+        "Apple is looking at buying U.K. startup for $1 billion",
+    )
+
+    for sample in samples:
+        # One three-line report (source text, phrase, blank) per phrase found
+        for found in find_exploitable_phrases(sample):
+            print(f"Original phrase: {sample}")
+            print(f"Exploitable phrase: {found} deez")
+            print()