Beautiful soup opens python command line and nothing happens - Printable Version +- Python Forum (https://python-forum.io) +-- Forum: Python Coding (https://python-forum.io/forum-7.html) +--- Forum: Web Scraping & Web Development (https://python-forum.io/forum-13.html) +--- Thread: Beautiful soup opens python command line and nothing happens (/thread-4233.html) |
Beautiful soup opens python command line and nothing happens - Prince_Bhatia - Aug-01-2017 i am new to programming , i have created a webscraper in python using beautiful soup but when i run this program it opens python command line and and just cursor blink on it and nothing happens...please dont mind the indentation,can someone please tell me what is happeing here, i am using python 3.6 import urllib.request import urllib import json import xml.etree.ElementTree as ET import csv from bs4 import BeautifulSoup link = 'https://maharerait.mahaonline.gov.in/searchlist/searchlist' talukaLink = "https://maharerait.mahaonline.gov.in/SearchList/GetTaluka" distlink = "https://maharerait.mahaonline.gov.in/SearchList/GetDistrict" prjLink = "https://maharerait.mahaonline.gov.in/SearchList/GetProjectName" alldata = [] links = {} certificatedata = [] def getData(url, values): data = urllib.parse.urlencode(values) data = data.encode('utf-8') req = urllib.request.Request(url, data) response=urllib.request.urlopen(req) data = response.read() data = data.decode("utf-8") return data def getDivsion(): ## for now we are taking 6 districts.. 
it needs to updated when the data gets updatedd return range(1,7) def getDistrict(divId): global distlink values = {'DivID': divId} data = getData(distlink, values) return data def parseJson(data): parsed = json.loads(data) return parsed def getTaluka(disId): global talukaLink values= {'DisID': disId} data = getData(talukaLink, values) return data def getProjects(divId, disId): global prjLink values= {'DisID': disId, 'DivID': divId} #print(values) data = getData( prjLink, values) if len(data)<10: return "{}" return data def getProjectsList(): divList = getDivsion() flag = 0 for divId in divList: disData = getDistrict(divId) disList = parseJson(disData) for disObj in disList: disId = disObj["ID"] prjData = getProjects(divId, disId) #print(" >>>> "+str(disId)+" >> "+str(divId)) #print(prjData) prjJson = parseJson(prjData) for prjObj in prjJson: flag += 1 prjId = prjObj["ID"] values = {'ID':0, 'pageTraverse': 1, 'Division': divId, 'hdnDistrict': '', 'hdnProject':'', 'District': disId, 'Taluka':'', 'Village': '', 'Project': prjId, 'CertiNo':'', 'btnSearch':'Search'} finalPrjData = getData(link, values) parseXMLData(finalPrjData) #if len(alldata)>100: # break def parseXMLData(htmldata): global alldata, links soup = BeautifulSoup(htmldata, "html.parser") tables = soup.find_all("table") for table in tables: attr = table.attrs if "table" in attr['class']: tbody = table.find_all("tbody") if len(tbody)>0: tbody = tbody[0] tr_lst = tbody.find_all("tr") for tr in tr_lst: sublist = [] td_lst = tr.find_all("td") if len(td_lst)>6: prjname = td_lst[1].text proname = td_lst[2].text certNo = td_lst[3].text sublist.append(prjname) sublist.append(proname) sublist.append(certNo) td = td_lst[4] a_lst = td.find_all("a") if len(a_lst)>0: a = a_lst[0] href = a.attrs['href'] link = "https://maharerait.mahaonline.gov.in/"+href links[certNo] = link sublist.append(link) if len(sublist)>0: alldata.append(sublist) return alldata def writedata(alldata1, filename): print(" >>>> FINAL PRINTING DATA 
>>>> ") #import pdb; pdb.set_trace() with open("./"+filename,'w') as csvfile: csvfile = csv.writer(csvfile, delimiter=',') #csvfile.writerow(titleRow) csvfile.writerow("") for i in range(0, len( alldata1 )): #print(alldata1[i]) csvfile.writerow( alldata1[i] ) def processlinksforcert(): global links, certificatedata print(">> Came in fetching certificates data >>> " ) for certno in links.keys(): link = links[certno] htmldata = getData(link, {}) soup = BeautifulSoup(htmldata, "html.parser") divs = soup.find_all("div") for div in divs: attr = div.attrs if "id" in attr.keys() and "DivProfessional" in attr['id']: table = div.find_all("table") if len(table)<=0: continue t_attr = table[0].attrs if "table" in t_attr["class"]: table = table[0] tr_lst = table.find_all("tr") index = 1 while index<len(tr_lst): #import pdb; pdb.set_trace() #for tr in tr_lst: #if index==0: # continue tr = tr_lst[index] index += 1 sublist = [] td_lst = tr.find_all("td") if len(td_lst)>2: sublist.append(certno) pername = formattext( td_lst[0].text) cerno = formattext( td_lst[1].text ) proftype = formattext( td_lst[2].text ) sublist.append(pername) sublist.append(cerno) sublist.append(proftype) certificatedata.append(sublist) return certificatedata def formattext(text): while text.find("\r\n")>=0: text = text.replace("\r\n","") while text.find(" ")>=0: text = text.replace(" ","") return text def main(): global alldata, certificatedata #data = getData(url, {}) getProjectsList() writedata(alldata, "data.csv") data = processlinksforcert() writedata( data, "certificates.csv" ) main() RE: Beautiful soup opens python command line and nothing happens - Larz60+ - Aug-01-2017 suggest you read the following. You should use requests rather than urllib, and these explain it's use with beautifulsoup best web scraping part1 web scraping part2 RE: Beautiful soup opens python command line and nothing happens - Prince_Bhatia - Aug-01-2017 (Aug-01-2017, 11:20 AM)Larz60+ Wrote: suggest you read the following. 
Now I received these errors: TimeoutError: [WinError 10060] A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond. ConnectionResetError: [WinError 10054] An existing connection was forcibly closed by the remote host. RE: Beautiful soup opens python command line and nothing happens - Larz60+ - Aug-01-2017 You should do the tutorials. RE: Beautiful soup opens python command line and nothing happens - Prince_Bhatia - Aug-01-2017 Maybe I should import requests and give it a try. Thank you so much for your help. |