Feb-26-2024, 08:16 PM
I have the below code which scrapes this page: https://www.eeoc.gov/newsroom/search.
It works well but I also want it to open each url and scrape the full text on the page for each. Any suggestions on how to modify this code to achieve?
It works well but I also want it to open each url and scrape the full text on the page for each. Any suggestions on how to modify this code to achieve?
import csv import requests from bs4 import BeautifulSoup def scrape_eec_news(): base_url = "https://www.eeoc.gov/newsroom/search?page=" results = [] page_number = 0 while True: page_number += 1 url = base_url + str(page_number) response = requests.get(url) response.raise_for_status() soup = BeautifulSoup(response.content, "html.parser") entries = soup.find_all("div", class_="views-row") if not entries: break print("Scraping page", page_number) # Print the page number for entry in entries: title_elem = entry.h2 description_elem = entry.p date_elem = entry.find("div", class_="field--type-datetime") url_elem = entry.a title = title_elem.text.strip() description = description_elem.text.strip() if description_elem else "" date = date_elem.text.strip() if date_elem else "" # Check if date_elem is not None url = url_elem["href"] # Add the 'agency' column with the value "United States Equal Employment Opportunity Commission" results.append( { "title": title, "description": description, "date": date, "url": url, "agency": "United States Equal Employment Opportunity Commission" } ) return results def export_to_csv(data, filename): with open(filename, "w", newline="", encoding="utf-8") as csvfile: fieldnames = ["title", "description", "date", "url", "agency"] # Include 'agency' in the fieldnames writer = csv.DictWriter(csvfile, fieldnames=fieldnames) writer.writeheader() for entry in data: writer.writerow(entry) if __name__ == "__main__": news_entries = scrape_eec_news() export_to_csv(news_entries, "eec_news.csv") print("Data exported to eec_news.csv")