import os
import re
from http.client import HTTPException
from urllib.parse import urldefrag, urljoin
from urllib.request import urlopen

from bs4 import BeautifulSoup

# Build a wordlist from the visible text of a web page and of every page it
# links to (one level deep).
#
# The script asks for a start URL and an output file name, downloads the
# pages, strips markup, URLs and punctuation from their text and writes the
# unique words to the output file, one per line.

# Punctuation tokens that are dropped when they stand alone and trimmed when
# they trail a word.
SPECIAL_CHARACTERS = [
    ",", "...", "!", "?", ".", "[", "]", "{", "}", "|", "#", "&", "*", "/",
    ":", ";", "+", "-", "_", "=", "<", ">",
]

# Character set handed to str.rstrip(); the multi-character "..." entry is
# already covered by ".".
_TRAILING_CHARACTERS = "".join(c for c in SPECIAL_CHARACTERS if len(c) == 1)

# Matches a whole http(s) URL. The trailing "+" matters: without it only the
# scheme and one following character are matched.
URL_PATTERN = re.compile(
    r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"
)


def fetch_soup(url):
    """Download *url* and return it parsed by BeautifulSoup.

    Returns None (after printing the reason) when the page cannot be
    fetched, so one broken link does not abort the whole crawl.
    """
    try:
        with urlopen(url) as response:
            html = response.read()
    except (OSError, ValueError, HTTPException) as err:
        # URLError/HTTPError are OSError subclasses; ValueError covers
        # malformed or unsupported URLs.
        print("skipping %s: %s" % (url, err))
        return None
    return BeautifulSoup(html, features="html.parser")


def collect_links(soup, base_url):
    """Return the absolute http(s) URLs of all ``<a href>`` targets in *soup*.

    Relative links are resolved against *base_url*, fragments are dropped,
    anchors without an href and non-http(s) schemes (mailto:, javascript:,
    ...) are skipped, and duplicates are removed while keeping page order.
    """
    links = []
    for anchor in soup.find_all("a"):
        href = anchor.get("href")
        if not href:
            continue
        absolute = urldefrag(urljoin(base_url, href)).url
        if absolute.startswith(("http://", "https://")):
            links.append(absolute)
    return list(dict.fromkeys(links))


def visible_text(soup):
    """Return the text of *soup* without ``<script>``/``<style>`` content.

    The script and style elements are removed from *soup* in place.
    """
    for element in soup(["script", "style"]):
        element.extract()
    return soup.get_text()


def crawl(start_url):
    """Return the visible text of *start_url* and of each page it links to.

    Only links found on the start page are followed. Pages that fail to
    download are skipped; the result is a list with one string per page.
    """
    start_page = fetch_soup(start_url)
    if start_page is None:
        return []
    print("got source code!\n")

    links = collect_links(start_page, start_url)
    texts = [visible_text(start_page)]

    already_fetched = urldefrag(start_url).url
    for url in links:
        if url == already_fetched:
            continue
        page = fetch_soup(url)
        if page is None:
            continue
        print("got source code!\n")
        texts.append(visible_text(page))
    return texts


def strip_urls(text):
    """Return *text* with every http(s) URL removed."""
    return URL_PATTERN.sub("", text)


def build_word_list(text):
    """Return the unique words of *text* in order of first appearance.

    URLs are removed first, the text is split on whitespace, trailing
    characters from SPECIAL_CHARACTERS are trimmed from every token and
    tokens that end up empty are dropped. Trimming happens before
    de-duplication so that "foo" and "foo," collapse into one entry.
    """
    words = []
    for token in strip_urls(text).split():
        word = token.rstrip(_TRAILING_CHARACTERS)
        if word:
            words.append(word)
    return list(dict.fromkeys(words))


def main():
    """Prompt for a URL and an output name, crawl, and write the wordlist."""
    print("Please enter a domain: ")
    print("Format: (https://www.your-domain.com/)\n")
    start_url = input().strip()

    print("\nPlease enter the name of your wordlist: \n")
    word_list_name = input()

    # Pages are joined with a newline so words from different pages never
    # run together.
    words = build_word_list("\n".join(crawl(start_url)))

    # Explicit UTF-8 so scraped non-ASCII text cannot fail on a narrow
    # locale encoding.
    with open(word_list_name, "w", encoding="utf-8") as f:
        f.writelines("%s\n" % word for word in words)

    print("Successfully created: " + word_list_name)


if __name__ == "__main__":
    main()