Scraping from multiple URLs - HTML

import requests
from bs4 import BeautifulSoup
import pandas as pd

# scraping data
page = requests.get('https://www.sec.gov/Archives/edgar/data/11860/0000011860-00-000025.txt')
soup = BeautifulSoup(page.content, 'html.parser')
data_1 = list(soup.children)[8]
main_data = list(data_1.children)[1].get_text()

# number of words
num = len(main_data.split())
This is my working code for counting the total number of words from a single URL. The challenge now is to count the number of words for each of the 500 URLs stored in a column of a CSV file. I have tried a lot but failed.

For parsing the CSV, there's the built-in csv library, so you'd simply need something like:
import csv

with open('yourfile.csv', newline='') as csvfile:
    reader = csv.reader(csvfile)  # add delimiter=... / quotechar=... depending on the CSV structure
    for row in reader:
        url = row[0]  # csv.reader yields a list per row; the URL is in the first column
        page = requests.get(url)
        # the rest of what you want to do with the content of url
You could put your current code into a function and then call it for each parsed URL.
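For example, a minimal sketch of that approach, assuming the CSV has a column named 'url' (the column name, the output file name, and the User-Agent header are assumptions; SEC.gov tends to reject requests that do not declare a descriptive User-Agent):

import requests
from bs4 import BeautifulSoup
import pandas as pd

def count_words(url):
    # Download one filing and return its word count, reusing the parsing steps above
    # Assumed header: replace with your own name/email; SEC.gov may block anonymous requests
    page = requests.get(url, headers={'User-Agent': 'your-name your-email@example.com'})
    soup = BeautifulSoup(page.content, 'html.parser')
    data_1 = list(soup.children)[8]
    main_data = list(data_1.children)[1].get_text()
    return len(main_data.split())

# Read the 500 URLs (assumed column name: 'url') and count words for each
urls = pd.read_csv('yourfile.csv')
urls['word_count'] = urls['url'].apply(count_words)
urls.to_csv('word_counts.csv', index=False)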

Related

Python: Creating PDF from PNG images and CSV tables using reportlab

I am trying to create a PDF document from a series of PNG images and a series of CSV tables using the Python package reportlab. The tables are giving me a little bit of grief.
This is my code so far:
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from reportlab.lib.pagesizes import letter
from reportlab.platypus import SimpleDocTemplate
from reportlab.pdfgen.canvas import Canvas
from reportlab.platypus import *
from reportlab.platypus.tables import Table
from PIL import Image
from matplotlib.backends.backend_pdf import PdfPages

# Set the path to the folder containing the images and tables
folder_path = 'Files'

# Create a new PDF document
pdf_filename = 'testassessment.pdf'
canvas = Canvas(pdf_filename)

# Iterate through the files in the folder
for file in os.listdir(folder_path):
    file_path = os.path.join(folder_path, file)
    # If the file is an image, draw it on the PDF
    if file.endswith('.png'):
        canvas.drawImage(file_path, 105, 148.5, width=450, height=400)
        canvas.showPage()  # ends page
    # If the file is a table, draw it on the PDF
    elif file.endswith('.csv'):
        df = pd.read_csv(file_path)
        table = df.to_html()
        canvas.drawString(10, 10, table)
        canvas.showPage()

# Save the PDF
canvas.save()
The tables are not working: when I use .drawString, the table content does not come out properly formatted.
Does anyone know how I can get the table to be properly inserted into the PDF?
According to the reportlab docs, page 14, "The draw string methods draw single lines of text on the canvas." You might want to have a look at "The text object methods" on the same page.
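Since the underlying goal is to render a DataFrame as a table, another route is reportlab's platypus Table flowable (already imported in the code above). A minimal sketch, with the file names assumed for illustration:

import pandas as pd
from reportlab.lib import colors
from reportlab.lib.pagesizes import letter
from reportlab.platypus import SimpleDocTemplate, Table, TableStyle

df = pd.read_csv('Files/example.csv')  # hypothetical CSV path

# Table expects a list of rows; prepend the column names as a header row
data = [df.columns.tolist()] + df.astype(str).values.tolist()  # convert cells to strings for display
table = Table(data)
table.setStyle(TableStyle([
    ('GRID', (0, 0), (-1, -1), 0.5, colors.grey),       # draw cell borders
    ('BACKGROUND', (0, 0), (-1, 0), colors.lightgrey),  # shade the header row
]))

# SimpleDocTemplate lays out flowables (tables, images, paragraphs) across pages automatically
doc = SimpleDocTemplate('table_example.pdf', pagesize=letter)
doc.build([table])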
You might want to consider using PyMuPDF with Stories; it allows for more flexibility of layout from a data input. For an example of something very similar to what you are trying to achieve, see: https://pymupdf.readthedocs.io/en/latest/recipes-stories.html#how-to-display-a-list-from-json-data

Splitting sentences from a .txt file to .csv using NLTK

I have a corpus of newspaper articles in a .txt file, and I'm trying to split it into sentences and write them to a .csv file so that I can annotate each sentence.
I was told to use NLTK for this purpose, and I found the following code for sentence splitting:
import nltk
from nltk.tokenize import sent_tokenize
sent_tokenize("Here is my first sentence. And that's a second one.")
However, I'm wondering:
How does one use a .txt file as an input for the tokenizer (so that I don't have to just copy and paste everything), and
How does one output a .csv file instead of just printing the sentences in my terminal.
Reading a .txt file & tokenizing its sentences
Assuming the .txt file is located in the same folder as your Python script, you can read a .txt file and tokenize the sentences using NLTK as shown below:
from nltk import sent_tokenize
# nltk.download('punkt')  # run once (after import nltk) if the tokenizer models are not installed yet

with open("myfile.txt") as file:
    textFile = file.read()

tokenTextList = sent_tokenize(textFile)
print(tokenTextList)
# Output: ['Here is my first sentence.', "And that's a second one."]
Writing a list of sentence tokens to .csv file
There are a number of options for writing a .csv file. Pick whichever is more convenient (e.g. if you already have pandas loaded, use the pandas option).
To write a .csv file using the pandas module:
import pandas as pd
df = pd.DataFrame(tokenTextList)
df.to_csv("myCSVfile.csv", index=False, header=False)
To write a .csv file using the numpy module:
import numpy as np
np.savetxt("myCSVfile.csv", tokenTextList, delimiter=",", fmt="%s")
To write a .csv file using the csv module:
import csv

with open('myCSVfile.csv', 'w', newline='') as file:
    write = csv.writer(file, lineterminator='\n')
    # write.writerows([tokenTextList])  # this would put every sentence into a single row
    write.writerows([[token] for token in tokenTextList])  # one sentence per row, matching the pandas-style output
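Putting the pieces together, an end-to-end sketch (the file names are placeholders) that reads the corpus, tokenizes it, and writes one sentence per row:

import pandas as pd
from nltk import sent_tokenize

# Read the whole corpus, split it into sentences, and write one sentence per CSV row
with open("myfile.txt") as file:
    sentences = sent_tokenize(file.read())

pd.DataFrame(sentences, columns=["sentence"]).to_csv("sentences.csv", index=False)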

Extract data from a website that has a load-more function and save all the data into CSV

I have a Python script where I extract data from a website
https://www.ema.europa.eu/en/search/search/field_ema_web_topics%253Aname_field/Scientific%20guidelines/field_ema_web_categories%253Aname_field/Human?sort=field_ema_computed_date_field&order=desc
and save only the "ecl-list-item__title ecl-heading" and "small" elements into Excel for all 670 results.
Currently, I have a script that loops over the page and saves the collected information into an Excel file, but no results are appended. Please find below the script I'm currently stuck on. Please help me with changes to make it work.
# Import required modules
import requests
import bs4
from bs4 import BeautifulSoup
import pandas as pd

# Input web URL
URL = "https://www.ema.europa.eu/en/search/search/field_ema_web_topics%253Aname_field/Scientific%20guidelines/field_ema_web_categories%253Aname_field/Human?sort=field_ema_computed_date_field&order=desc"
result = requests.get(URL)

# Creating soup object
soup = bs4.BeautifulSoup(result.text, 'lxml')

# Searching div tags having the search-results class
cases = soup.find_all('div', class_='view view-search-solr-sitewide-search view-id-search_solr_sitewide_search view-display-id-ema_sitewide_search view-dom-id-99ccfcd90732eb90b270257a1c29fd39 jquery-once-1-processed')

data = []
# Get the title from each result
for i in cases:
    span = i.find('div', class_='ecl-list-item__title ecl-heading')
    data.append(span.text)

# Display the collected titles
print(data)
Please let me know if you need further clarification.
Thanks
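For the saving step itself, a minimal sketch with pandas, assuming data already holds the titles collected above (the column and file names are placeholders); note that a single request only returns the first page of results, so collecting all 670 entries still requires fetching the additional pages behind the "load more" button (e.g. via the paginated URLs or a browser automation tool):

import pandas as pd

# Hypothetical: 'data' is the list of titles built in the loop above
df = pd.DataFrame(data, columns=['title'])
df.to_csv('ema_results.csv', index=False)
# For Excel output instead (requires the openpyxl package):
# df.to_excel('ema_results.xlsx', index=False)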

How to convert multiple nested JSON files into single CSV file using python?

I have about 200 nested JSON files with nesting levels varying from one to three. Each JSON file consists of more than a thousand data points. The keys are the same in all the files. My objective is to combine the data from all the files in a tabular format in a single CSV file so that I can read and analyze all the data. I am looking for simple Python code with a brief explanation of each step to help me understand the whole sequence.
You can use this code snippet.
First of all, install pandas using
pip install pandas
After that, you can use this code to convert JSON files to CSV.
# code to save all data to a single file
import pandas as pd
import glob

path = './path to directory/*.json'
files = glob.glob(path)

data_frames = []
for file in files:
    with open(file, 'r') as f:
        data_frames.append(pd.read_json(f))

pd.concat(data_frames).to_csv("data.csv")
# code to save each JSON file to its own CSV file
import pandas as pd
import glob

path = './path to directory/*.json'
files = glob.glob(path)

for file in files:
    with open(file, 'r') as f:
        jsonData = pd.read_json(f.read())
        jsonData.to_csv(f.name + ".csv")
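Note that pd.read_json keeps nested objects as dict/list cells. If the nested levels need to be flattened into separate columns, a minimal sketch using json.load plus pandas' json_normalize (the path and output file name are placeholders, and it assumes each file holds a list of records or a single record):

import glob
import json
import pandas as pd

frames = []
for file in glob.glob('./path to directory/*.json'):
    with open(file, 'r') as f:
        payload = json.load(f)
    # json_normalize flattens nested keys into dotted column names, e.g. 'address.city'
    records = payload if isinstance(payload, list) else [payload]
    frames.append(pd.json_normalize(records))

pd.concat(frames, ignore_index=True).to_csv('data_flat.csv', index=False)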

Scrape for Absolute URL with html.parse and remove duplicates

I am trying to make sure that relative links are saved as absolute links in this CSV (using urllib.parse). I am also trying to remove duplicates, which is why I created the variable "ddupe".
I keep getting all the relative URLs saved when I open the CSV on the desktop.
Can someone please help me figure this out? I thought about calling set, just like in this page: How do you remove duplicates from a list whilst preserving order?
# Importing the requests library to make HTTP requests
# Importing the bs4 library to extract / parse html and xml files
# Utilize urlparse to change relative URLs to absolute URLs
# Import csv (built-in package) to read / write CSV files that Microsoft Excel can open
from bs4 import BeautifulSoup
import requests
from urllib.parse import urlparse
import csv

# Create the page variable
# Associate page with the request to obtain the information from raw_html
# Store the html information as text
page = requests.get('https://www.census.gov/programs-surveys/popest.html')
parsed = urlparse(page)
raw_html = page.text  # declare the raw_html variable
soup = BeautifulSoup(raw_html, 'html.parser')  # parse the html

# Remove duplicate htmls
ddupe = open('page.text', 'r').readlines()
ddupe_set = set(ddupe)
out = open('page.text', 'w')
for ddupe in ddupe_set:
    out.write(ddupe)

T = [["US Census Bureau Links"]]  # Title

# Finds all the links
links = map(lambda link: link['href'], soup.find_all('a', href=True))

with open("US_Census_Bureau_links.csv", "w", newline="") as f:
    cw = csv.writer(f, quoting=csv.QUOTE_ALL)  # Create a file handle for the csv writer
    cw.writerows(T)  # Writes the title row
    for link in links:  # Writes the links to the csv
        cw.writerow([link])
    f.close()  # closes the file
The function you're looking for is urljoin, not urlparse (both from the same package urllib.parse). It should be used somewhere after this line:
links = map(lambda link: link['href'], soup.find_all('a', href=True))
Use a list comprehension, or map + lambda as you did here, to join each relative URL with the base URL.
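For example, a minimal sketch of that idea, which also removes duplicates while preserving order (the base URL is the page requested above):

from urllib.parse import urljoin

base_url = 'https://www.census.gov/programs-surveys/popest.html'

# Join every href against the page it came from, so relative links become absolute
links = [urljoin(base_url, link['href']) for link in soup.find_all('a', href=True)]

# Remove duplicates while preserving order (dict keys keep insertion order in Python 3.7+)
links = list(dict.fromkeys(links))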