Python Scrapy Date Extraction Issue - Printable Version +- Python Forum (https://python-forum.io) +-- Forum: Python Coding (https://python-forum.io/forum-7.html) +--- Forum: Web Scraping & Web Development (https://python-forum.io/forum-13.html) +--- Thread: Python Scrapy Date Extraction Issue (/thread-28829.html) |
Python Scrapy Date Extraction Issue - tr8585 - Aug-05-2020 I am struggling with how to properly extract an oddly formatted date that exists on the page in one location using Python Scrapy. What needs to be changed to have the date included on every output row in yyyy-mm-dd format? Problematic code lines: data2 = response.xpath('//span[@class="tab"]/text()').get().replace(". ", "-") date = datetime.datetime.strptime(data2, "%d-%m-%Y").strftime("%Y-%m-%d") Sample output appears to contain only one character for the date. Example: {'match_id': '1893065', 'date': '0'} Here is the full spider. import scrapy import datetime import re from datetime import timedelta class Tennis_ExplorerSpider(scrapy.Spider): name = 'tennis_explorer' allowed_domains = ['tennisexplorer.com'] def daterange(start_date, end_date): for n in range(int((end_date - start_date).days)): yield start_date + timedelta(n) start_date = datetime.datetime.today() - datetime.timedelta(days=1) end_date = datetime.datetime.today() + datetime.timedelta(days=1) start_urls = [] start_url='https://www.tennisexplorer.com/matches/?type=all&year=' for single_date in daterange(start_date, end_date): start_urls.append(single_date.strftime(start_url+"%Y&month=%m&day=%d&timezone=-6")) def parse(self, response): #Extracting the content using xpath self.logger.debug('callback "parse": got response %r' % response) data = response.xpath('//table[@class="result"]//a[contains(@href,"match-detail")]/@href').extract() match_id =[re.sub('^.+=','',el) for el in data] data2 = response.xpath('//span[@class="tab"]/text()').get().replace(". 
", "-") date = datetime.datetime.strptime(data2, "%d-%m-%Y").strftime("%Y-%m-%d") #Give the extracted content row wise for item in zip(match_id, date): #create a dictionary to store the scraped info scraped_info = { 'match_id' : item[0], 'date' : item[1] } #yield or give the scraped info to scrapy yield scraped_info RE: Python Scrapy Date Extraction Issue - tr8585 - Aug-05-2020 I updated the code after receiving advice from a nice fellow on stackoverflow. There still seems to be a problem even though I receive no errors. The date is still only appearing once. Updated Code: import scrapy import datetime import re from datetime import timedelta class Tennis_ExplorerSpider(scrapy.Spider): name = 'tennis_explorer' allowed_domains = ['tennisexplorer.com'] def daterange(start_date, end_date): for n in range(int((end_date - start_date).days)): yield start_date + timedelta(n) start_date = datetime.datetime.today() - datetime.timedelta(days=1) end_date = datetime.datetime.today() + datetime.timedelta(days=1) start_urls = [] start_url='https://www.tennisexplorer.com/matches/?type=all&year=' for single_date in daterange(start_date, end_date): start_urls.append(single_date.strftime(start_url+"%Y&month=%m&day=%d&timezone=-6")) def parse(self, response): #Extracting the content using xpath self.logger.debug('callback "parse": got response %r' % response) data = response.xpath('//table[@class="result"]//a[contains(@href,"match-detail")]/@href').extract() match_id =[re.sub('^.+=','',el) for el in data] data2 = "01. 01. 
2008" date = re.sub("^(\d+).+?(\d+).+(\d{4})$",'\g<3>-\g<2>-\g<1>',data2) nbel = len(response.xpath('//table[@class="result"]//a[contains(@href,"match-detail")]')).extract() dates = [date]*nbel #Give the extracted content row wise for item in zip(match_id, dates): #create a dictionary to store the scraped info scraped_info = { 'match_id' : item[0], 'dates' : item[1] } #yield or give the scraped info to scrapy yield scraped_info Updated Output: Quote: {'match_id': ['1893065', '1893066', '1893061', '1893059', '1893062', '1893067', '1893063', '1893064', '1893130', '1893133', '1893134', '1893117', '1893045', '1893046', '1893047', '1893048', '1893158', '1893105', '1893106', '1893108', '1893107', '1893109', '1893110', '1893053', '1893054', '1893055', '1893055', '1893056', '1893056', '1893057', '1893058', '1893058', '1893139', '1893113', '1893114', '1893115', '1893116', '1893040', '1893040', '1893039', '1893037', '1893036', '1893038', '1892792', '1892792', '1892802', '1892802', '1892794', '1892794', '1893068', '1893078', '1893073', '1893077', '1893074', '1893069', '1893075', '1893070', '1893079', '1893071', '1893076', '1893118', '1893084', '1893080', '1893081', '1893082', '1893083', '1893085', '1893121', '1893122', '1893123', '1893124', '1893125', '1893128', '1893126', '1893127', '1893049', '1893050', '1893051', '1893052', '1893111', '1893140', '1893141', '1893142', '1893143', '1893144', '1892935', '1892934', '1892930', '1892931'], 'date': '04-08-2020' |