Ok - I like the idea of going back to the Larz60+ file. The problem is it only gets me half of what I need.
self.homepath = Path('.')
self.completionspath = self.homepath / 'comppdf'
self.completionspath.mkdir(exist_ok=True)
self.geocorepdf = self.homepath / 'geocorepdf'
self.geocorepdf.mkdir(exist_ok=True)
self.textpath = self.homepath / 'text'
self.text.mkdir(exist_ok=True)
While this is only creating folders - it will hold completion reports for oil and gas wells. For example:
http://wogcc.state.wy.us/legacywogcce.cfm and click on 'Wells', then click on 'By API Number' and then enter 2521203 in the space provided. Up on the top right are 'Completions' and 'Cores/Pressures/Reports'. These are the two places I need reports from.
Larz60+'s file is the one that got me where I am right now. I'm new to this and he has helped me more than most can possibly imagine. That said - this is where we are.
import requests
from bs4 import BeautifulSoup
from pathlib import Path
class GetCompletions:
def __init__(self, infile):
"""Above will create a folder called comppdf, and geocorepdf wherever the WOGCC
File Downloads file is run from as well as a text file for my api file to
reside.
"""
self.homepath = Path('.')
self.completionspath = self.homepath / 'comppdf'
self.completionspath.mkdir(exist_ok=True)
self.geocorepdf = self.homepath / 'geocorepdf'
self.geocorepdf.mkdir(exist_ok=True)
self.textpath = self.homepath / 'text'
self.textpath.mkdir(exist_ok=True)
self.infile = self.textpath / infile
self.api = []
self.parse_and_save(getpdfs=True)
def get_url(self):
for entry in self.apis:
yield (entry, "http://wogcc.state.wy.us/wyocomp.cfm?nAPI=[]".format(entry[3:10]))
yield (entry, "http://wogcc.state.wy.us/whatupcores.cfm?autonum=[]".format(entry[3:10]))
"""Above will get the URL that matches my API numbers."""
def parse_and_save(self, getpdfs=False):
for file in filelist:
with file.open('r') as f:
soup = BeautifulSoup(f.read(), 'lxml')
if getpdfs:
links = soup.find_all('a')
for link in links:
url in link['href']
if 'www' in url:
continue
print('downloading pdf at: {}'.format(url))
p = url.index('=')
response = requests.get(url, stream=True, allow_redirects=False)
if response.status_code == 200:
try:
header_info = response.headers['Content-Disposition']
idx = header_info.index('filename')
filename = self.log_pdfpath / header[idx+9:]
except ValueError:
filename = self.log_pdfpath / 'comp{}'.format(url[p+1:])
print("couldn't locate filename for {} will use: {}".format(file, filename))
except KeyError:
filename = self.log_pdfpath / 'comp{}.pdf'.format(url[p+1:])
print('got KeyError on {}, respnse.headers = {}'.format(file, response.headers))
print('will use name: {}'.format(filename))
print(repsonse.headers)
with filename.open('wb') as f:
f.write(respnse.content)
self.parse_and_save(getpdfs=True)
sfname = self.textpath / 'summary_{}.txt'.format((file.name.split('_'))[1].split('.')[0][3:10])
tds = soup.find_all('td')
with sfname.open('w') as f:
for td in tds:
if td.text:
if any(field in td.text for field in self.fields):
f.write('{}\n'.format(td.text))
if __name__ == '__main__':
GetCompletions('api.txt')
Ok - errors help me too!
Error:
RESTART: C:\Users\toliver\AppData\Local\Programs\Python\Python36\WOGCC\WOGCC_File_Downloads test 2.py
Traceback (most recent call last):
File "C:\Users\toliver\AppData\Local\Programs\Python\Python36\WOGCC\WOGCC_File_Downloads test 2.py", line 73, in <module>
GetCompletions('api.txt')
File "C:\Users\toliver\AppData\Local\Programs\Python\Python36\WOGCC\WOGCC_File_Downloads test 2.py", line 22, in __init__
self.parse_and_save(getpdfs=True)
File "C:\Users\toliver\AppData\Local\Programs\Python\Python36\WOGCC\WOGCC_File_Downloads test 2.py", line 34, in parse_and_save
for file in filelist:
NameError: name 'filelist' is not defined
I hope this all makes sense. I really appreciate your help!