Aug-05-2020, 04:32 AM
I updated the code after receiving advice from a nice fellow on stackoverflow. There still seems to be a problem even though I receive no errors. The date is still only appearing once.
Updated Code:
Updated Code:
import scrapy import datetime import re from datetime import timedelta class Tennis_ExplorerSpider(scrapy.Spider): name = 'tennis_explorer' allowed_domains = ['tennisexplorer.com'] def daterange(start_date, end_date): for n in range(int((end_date - start_date).days)): yield start_date + timedelta(n) start_date = datetime.datetime.today() - datetime.timedelta(days=1) end_date = datetime.datetime.today() + datetime.timedelta(days=1) start_urls = [] start_url='https://www.tennisexplorer.com/matches/?type=all&year=' for single_date in daterange(start_date, end_date): start_urls.append(single_date.strftime(start_url+"%Y&month=%m&day=%d&timezone=-6")) def parse(self, response): #Extracting the content using xpath self.logger.debug('callback "parse": got response %r' % response) data = response.xpath('//table[@class="result"]//a[contains(@href,"match-detail")]/@href').extract() match_id =[re.sub('^.+=','',el) for el in data] data2 = "01. 01. 2008" date = re.sub("^(\d+).+?(\d+).+(\d{4})$",'\g<3>-\g<2>-\g<1>',data2) nbel = len(response.xpath('//table[@class="result"]//a[contains(@href,"match-detail")]')).extract() dates = [date]*nbel #Give the extracted content row wise for item in zip(match_id, dates): #create a dictionary to store the scraped info scraped_info = { 'match_id' : item[0], 'dates' : item[1] } #yield or give the scraped info to scrapy yield scraped_infoUpdated Output:
Quote:{'match_id': ['1893065', '1893066', '1893061', '1893059', '1893062', '1893067', '1893063', '1893064', '1893130', '1893133', '1893134', '1893117', '1893045', '1893046', '1893047', '1893048', '1893158', '1893105', '1893106', '1893108', '1893107', '1893109', '1893110', '1893053', '1893054', '1893055', '1893055', '1893056', '1893056', '1893057', '1893058', '1893058', '1893139', '1893113', '1893114', '1893115', '1893116', '1893040', '1893040', '1893039', '1893037', '1893036', '1893038', '1892792', '1892792', '1892802', '1892802', '1892794', '1892794', '1893068', '1893078', '1893073', '1893077', '1893074', '1893069', '1893075', '1893070', '1893079', '1893071', '1893076', '1893118', '1893084', '1893080', '1893081', '1893082', '1893083', '1893085', '1893121', '1893122', '1893123', '1893124', '1893125', '1893128', '1893126', '1893127', '1893049', '1893050', '1893051', '1893052', '1893111', '1893140', '1893141', '1893142', '1893143', '1893144', '1892935', '1892934', '1892930', '1892931'], 'date': '04-08-2020'