Create footer in word docx by creating a footer.xml file in the word - folder of the docx.zip via python? - python-docx

I have no idea how to do this, considering the (randomly?) generated rsids in the XML code. Does anyone have a solution?
from docx.text.run import Run
from docx import Document
# Open the document and take the first paragraph of the first section's footer.
doc = Document('/Users/cezi/Desktop/ME.docx')
p = doc.sections[0].footer.paragraphs[0]

# NOTE(review): the indentation of this snippet was lost; the nesting below
# (everything up to add_text("Page") inside the `if`) is a reconstruction
# -- confirm against the intended layout.
for run in p.runs:
    if ' ' in run.text:
        # Build a fresh <w:r> element and place it directly before the
        # matching run, then wrap it in a Run proxy so it can be edited.
        new_run_element = p._element._new_r()
        run._element.addprevious(new_run_element)
        new_run = Run(new_run_element, run._parent)
        new_run.text = "left"
        new_run.add_tab()
        new_run.add_text("Page")

# Append a tab and the right-hand text at the end of the paragraph.
p.add_run().add_tab()
p.add_run("right")
doc.save("HOW.docx")

Related

Extracting contents from a PDF to display on web pages

I'm trying to display the contents of the pdf by converting PDF into HTML using Adobe Acrobat 2021, extracting the paragraph structure, and post-processing. I saw a website whose only source is judgments as PDFs from the Supreme Court Website and displays them flawlessly. Does anybody have any idea how it's done?
My current flow is to convert the PDF into HTML to preserve the page layout and extract the text using Beautifulsoup.
Issues I'm currently facing:
Bullet numbers are somehow dynamically calculated in the PDF and are rendered through a CSS
::before
pseudo-element in the browser, so bs4 does not see them
Some paragraphs in between are missed because they are detected incorrectly
Table is detected as a table but some imperfections
PDF example : drive link
HTML from Adobe Acrobat : HTML file of the above PDF
This is my goal : Advocatekhoj
This is how accurate I'm expecting it to be.
Could someone please shed light on this? how-to(s) or any suggestions.
Note: I tried various PDF to HTML tools and the Adobe Acrobat was the best in detecting paragraph layout and preserving structure.
from bs4 import BeautifulSoup
from pprint import pprint
from os import listdir
from os.path import isfile, join
# Convert every HTML file in `mypath` into a text file under output_txt/.
# Each top-level <li> becomes one paragraph block followed by "--sep--";
# tables inside an <li> are dumped as raw HTML after a "--table--" marker;
# trailing <p> elements outside any list are appended at the end.
mypath = "sup_del_htmls/"
onlyfiles = [f for f in listdir(mypath) if isfile(join(mypath, f))]

# A paragraph ending in one of these characters is treated as complete and
# is followed by a newline; otherwise it is joined to the next with a space.
PARA_END_CHARS = (".", ":", ";", ",", '"', "'")

# enumerate() so the progress counter actually advances (it was stuck at 0).
for counter, f in enumerate(onlyfiles):
    print(counter)
    with open("output_txt/"+f+".txt", 'w',encoding='utf-8') as txtfile:
        with open(mypath+f, encoding='utf-8') as fp:
            soup = BeautifulSoup(fp, "html.parser")

            for li in soup.select("li"):
                # Nested list items are handled through their top-level parent.
                if li.find_parent("li"):
                    continue

                full_para = ""
                for para in li.select("p"):
                    # Flatten <span> wrappers so get_text() returns one run of text.
                    for match in para.findAll('span'):
                        match.unwrap()
                    # The first literal is kept from the original; it looks like a
                    # mis-decoded pilcrow, so the real pilcrow sign is stripped too.
                    # NOTE(review): confirm which form occurs in the input files.
                    para_txt = para.get_text().replace("ΒΆ", "").replace("\u00b6", "")
                    para_txt = para_txt.strip()
                    if para_txt.endswith(PARA_END_CHARS):
                        full_para += para_txt + "\n"
                    else:
                        full_para += para_txt + " "
                txtfile.write(full_para)
                txtfile.write("\n" + "--sep--" + "\n")

                for table in li.find_all("table"):
                    txtfile.write("--table--"+ "\n")
                    txtfile.write(str(table) + "\n")
                    txtfile.write("--sep--" + "\n")

            # Walk the <p> elements backwards until the first one that lives in
            # a list; what was collected is the text that follows the last list.
            reversed_end = []
            for p in reversed(soup.select("p")):
                if p.find_parent('li') or p.find_parent('ol'):
                    break
                reversed_end.append(" ".join(p.text.split()))

            if reversed_end:
                for final_end in reversed(reversed_end):
                    txtfile.write(final_end + "\n")
                txtfile.write("--sep--" + "\n")
The Result : output.txt
For the numbering with :before in css, you can try to extract the selector/s for the numbered items with a function like this
def getLctrSelectors(stsh):
    """Find CSS-counter numbered lists in a stylesheet.

    Args:
        stsh: an object exposing ``get_text()`` that returns the stylesheet
            text (for example a ``<style>`` tag), or a falsy value.

    Returns:
        A list of ``(selector, start)`` tuples, one per list id whose
        ``#id > li`` rule contains ``counter-increment``. ``selector`` is
        ``'<id> > li > *:first-child'`` and ``start`` is the last integer
        found in that id's ``counter-reset`` declaration, or 1 when there is
        none. Ids are returned in order of first appearance.
    """
    css_lines = (stsh.get_text() if stsh else '').splitlines()

    # Ids of rules such as "#l1 > li ... {counter-increment: ...}".
    # dict.fromkeys de-duplicates while keeping a deterministic order.
    list_ids = list(dict.fromkeys(
        line.replace('>li', '> li').split('> li')[0].strip()
        for line in css_lines
        if line.strip()[:1] == '#'
        and '> li' in line.replace('>li', '> li')
        and 'counter-increment' in line.split('{')[-1].split(':')[0]
    ))

    selectors = []
    for list_id in list_ids:
        sel = f'{list_id} > li > *:first-child'
        start = 1

        # First rule for this id whose second-to-last ':'-separated piece
        # mentions counter-reset.
        reset_rule = next(
            (
                line for line in css_lines
                if line.strip().startswith(list_id)
                and 'counter-reset' in line.split('{')[-1].split(':')[-2:][0]
            ),
            None,
        )
        if reset_rule is not None:
            value = reset_rule.split('{')[-1].split('counter-reset')[-1].split(':')[-1]
            digits = [w for w in value.split(';')[0].split() if w.isdigit()]
            if digits:
                start = int(digits[-1])

        selectors.append((sel, start))
    return selectors
(It should take a style tag as input and return a list of selectors and starting counts - like [('#l1 > li > *:first-child', 3)] for your sample html.)
You can use it in your code to insert the numbers into the text in the bs4 tree:
soup = BeautifulSoup(fp, "html.parser")
# Prepend the computed number ("3. ", "4. ", ...) to the first child of each
# numbered list item, starting from the count reported for that list.
for sel, ctStart in getLctrSelectors(soup.select_one('style')):
    for number, lif in enumerate(soup.select(sel), start=ctStart):
        lif.insert(0, f'{number}. ')
para_counter = 1
### REST OF CODE ###
I'm not sure I can help you with paragraphs and tables issues... Are you sure the site uses the same pdfs as you have? (Or that they use pdfs at all rather than something closer to the original/raw data?) Your pdf itself looked rather different from its corresponding page on the site.

How can I update a csv file through a code which constitutes of creating a folder holding the respective csv file without facing FileExistsError?

I have made a code of creating a folder that shall contain the output of the same code in a csv file. But when I wish to make amendments to the code so as to modify the output obtained in the csv file, I do not wish to run into FileExistsError. Is there any way I can do that? Sorry if the query is a foolish one, as I am just beginning to learn Python. Here's my code:
import csv
from pathlib import Path

# Raw string: same value as before, without the invalid "\l" / "\D" escapes.
path = Path(r'file\location\DummyFolder')
# exist_ok=True lets the script be re-run after edits without raising
# FileExistsError when the folder is already there.
path.mkdir(parents=True, exist_ok=True)
fpath = (path / 'example').with_suffix('.csv')
colours = ['red','blue', 'green', 'yellow']
colour_count = 0
# Mode 'w' truncates the file, so each run replaces the previous output.
with fpath.open(mode='w', newline='') as csvfile:
    fieldnames = ['number', 'colour']
    thewriter = csv.DictWriter(csvfile, fieldnames=fieldnames)
    thewriter.writeheader()
    for colour_count, colour in enumerate(colours, start=1):
        thewriter.writerow({'number': colour_count, 'colour': colour})

Writing prints in a txt file

I need some help here. I need the printed text written to a .txt file instead of being shown in the run output.
Cant seem to figure this thing out...
import random
import string
import itertools
def randomString(stringLength=3):
    """Return a random string of lowercase ASCII letters.

    Args:
        stringLength: number of characters to generate (default 3).

    Returns:
        A string of ``stringLength`` characters drawn from
        ``string.ascii_lowercase``; empty when ``stringLength`` is 0.
    """
    letters = string.ascii_lowercase
    return ''.join(random.choice(letters) for _ in range(stringLength))
num = 10
# itertools.repeat(num) with no count repeats forever; passing `num` as the
# `times` argument makes the loop run exactly `num` times.
for _ in itertools.repeat(None, num):
    print(randomString(3) + str(random.randint(10, 99)) + (randomString(3)))

File corruption when inserting images to an existing word file

The following code, applied to an existing file, works for 2 images, but beyond that the file is marked as corrupted (though it can be recovered perfectly in Word):
import docx
docTemplate = "TestTemplate.docx"

# Open the existing template; the pictures are appended to it.
doc_docx = docx.Document(docTemplate)

# One new paragraph holding a single run: three copies of the picture,
# with a break after each one except the last.
p = doc_docx.add_paragraph()
wp = p.add_run()
for breaks_left in (2, 1, 0):
    wp.add_picture('image.png')
    if breaks_left:
        wp.add_break()

doc_docx.save('TestFile2.docx')
The content of doc_docx.part.blob is available on pastebin
It appears that the docTemplate document already contained some objects. Thanks to the answer from #Tores76 on the python-docx GitHub, I could solve the "file corruption" issue, which means it was likely due to a duplicate docPr id:
# fix id
# qn() expands the "wp:docPr" prefix into the namespaced tag name that
# findall() needs; it was used here without being imported.
from docx.oxml.ns import qn

doc_element = doc_docx._part._element
docPrs = doc_element.findall('.//' + qn('wp:docPr'))
for docPr in docPrs:
    # Shift every docPr id found in the document element by 100000.
    docPr.set('id',str(int(docPr.get('id'))+100000))

Parsing HTML and writing PDF to disk (python)

My goal was to write a script that downloads all the pdf files from a user entered site.
Problem 1: The code does not return the anchor tags located inside the iframe. I tried explicitly using the iframe tag name and then using .contents, but the command returns an empty list.
Question 1: How to parse the iframe? Why doesn't the iframe.contents return its children i.e. the <a> tags?
Problem 2: Writing the PDFs to disk appears successful; however, when I attempt to open the files I get the following error:
"....could not open...because it is either not a supported file type
or because the file has been damaged ( for example, it was sent as an
email...and wasn't correctly decoded).
Question 2: Anybody encounter this before?
The code is split in two blocks; one for each problem delete the set of quotes around a block to run.
Lastly if anyone can explain why the two urls don't match in the first block of code that would be awesome. Code is commented; contains urls for each question. Thanks!
PYTHON CODE
import urllib.request, urllib.parse, urllib.error
from bs4 import BeautifulSoup
import ssl
# Script overview: ask the user for a URL, fetch it with certificate checks
# turned off, and parse it with BeautifulSoup. The two experiments that use
# the parsed page are wrapped in string literals below, so neither runs
# until its surrounding quotes are removed.
#initializing counters
# `slide` numbers the output files, `count` tallies the files written
# (both are only used by the disabled block for question 2).
slide = 1
count = 0
#ignore SSL cert errors
# NOTE: hostname checking and certificate verification are both switched off
# for every request made with this context.
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE
#get user url and create soup object
# `connect` is the open HTTP response for the page the user entered.
url = input("Enter the website name: ")
connect = urllib.request.urlopen(url, context=ctx)
soup = BeautifulSoup(connect, 'html.parser')
######## code block for question 1 revolving around parsing iframes and the issues with the
######## mismatching urls
#url used for code block 1: https://www.cs.ucr.edu/~epapalex/teaching/235_F19/index.html
"""
#trying to retrieve all anchor tags; doesn't print the anchor tags within the iframe
tags = soup('a')
for tag in tags:
print(tag)
print('\n')
#explictly asking for the iframe tag
iframe = soup.iframe
#the url printed on this line doesn't match the url printed once I get the src attribute
#navigating to the url listed here is what I use for the second block of code because it
#isn't an iframe
print(iframe)
iframe_src_url = iframe['src']
#this url doesn't match the one shown in the previous print statement and it leaves you dealing
#with another iframe
print(iframe_src_url)
"""
#########code block for question 2 where I enter the url found in the iframe src attribute
#url for block 2: https://docs.google.com/spreadsheets/d/e/2PACX-1vRF408HaDlR6Q9fx6WF6YzeNrZIkXZBqwz_qyN8hz8N4rhIrcpc_GWNMrCODVmucMEUhXIElxcXyDpY/pubhtml?gid=0&single=true&widget=true&headers=false
# NOTE(review): in the disabled block below, the response returned by
# urlopen(doc_url) is bound to `file` and then immediately replaced by the
# output file handle, and the bytes written come from `connect` (the page
# opened above), not from doc_url. The saved files therefore never contain
# the linked documents.
"""
tags = soup('a')
#iterate through tags, retrieve href addresses, navigate to the document, write data to file
for tag in tags:
doc_url = tag.get('href')
file = urllib.request.urlopen(doc_url, context=ctx)
file = open("Week " + str(slide) + " slides.pdf", 'wb')
file.write(connect.read())
file.close()
print("Finished file: ", slide)
count = count + 1
slide = slide + 1
print("Total files downloaded: ", count)"""
import requests
from bs4 import BeautifulSoup
# Fetch the course page and print the src URL of every <iframe> on it.
r = requests.get(
    'https://www.cs.ucr.edu/~epapalex/teaching/235_F19/index.html')
soup = BeautifulSoup(r.content, 'html.parser')
# find_all is the current name of the deprecated findAll alias.
for item in soup.find_all('iframe'):
    print(item.get('src'))
Output:
https://docs.google.com/spreadsheets/d/e/2PACX-1vRF408HaDlR6Q9fx6WF6YzeNrZIkXZBqwz_qyN8hz8N4rhIrcpc_GWNMrCODVmucMEUhXIElxcXyDpY/pubhtml?gid=0&single=true&widget=true&headers=false
And Regarding the second question:
import requests
from bs4 import BeautifulSoup
# Fetch the published spreadsheet page and collect the href of every
# <a rel="noreferrer"> link on it.
r = requests.get(
    'https://docs.google.com/spreadsheets/d/e/2PACX-1vRF408HaDlR6Q9fx6WF6YzeNrZIkXZBqwz_qyN8hz8N4rhIrcpc_GWNMrCODVmucMEUhXIElxcXyDpY/pubhtml?gid=0&single=true&widget=true&headers=false')
soup = BeautifulSoup(r.content, 'html.parser')
links = [item.get('href') for item in soup.find_all('a', {'rel': 'noreferrer'})]

for item in links:
    r = requests.get(item)
    # NOTE(review): this relies on the response carrying a Location header
    # and on the file name starting at character 56 of that URL -- confirm
    # both still hold for the target site.
    source = r.headers.get('Location')
    print(f"Saving File {source[56:]}")
    r1 = requests.get(source)
    with open(f"{source[56:]}", 'wb') as f:
        f.write(r1.content)

print(f"\nTotal File Downloaded is {len(links)}")
The script saves the files to your local disk and prints:
Saving File 01-intro-logistics.pdf
Saving File 02-data.pdf
Saving File 03-preprocessing.pdf
Saving File 03-preprocessing.pdf
Saving File 04-frequent-patterns.pdf
Saving File 05a-supervised.pdf
Saving File 05b-supervised.pdf
Saving File 05c-supervised.pdf
Saving File 06a-supervised-advanced.pdf
Saving File 06b-supervised-advanced.pdf
Saving File 07a-unsupervised.pdf
Saving File 07b-unsupervised.pdf
Saving File 07c-advanced-unsupervised.pdf
Saving File 08-graph-mining.pdf
Saving File 09-anomaly-detection.pdf
Saving File 10-time-series.pdf
Total File Downloaded is 16
Full Version:
import requests
from bs4 import BeautifulSoup
import html
def Get_Links():
    """Collect the document links published through the course page's iframe.

    Fetches the course index page, follows the ``src`` of its first
    ``<iframe>``, and gathers the ``href`` of every ``<a rel="noreferrer">``
    on that page.

    Returns:
        A ``(links, count)`` tuple: the set of hrefs and its size.
    """
    links = set()
    r = requests.get(
        'https://www.cs.ucr.edu/~epapalex/teaching/235_F19/index.html')
    soup = BeautifulSoup(r.text, 'html.parser')
    # Use the attribute value as-is: passing it through html.escape would
    # turn every "&" in the query string into "&amp;" and request a
    # different URL than the one the iframe points at.
    source = soup.find('iframe').get('src')
    r = requests.get(source)
    soup = BeautifulSoup(r.text, 'html.parser')
    for item in soup.find_all('a', {'rel': 'noreferrer'}):
        links.add(item.get('href'))
    return links, len(links)
def Save_Items():
    """Download every link returned by Get_Links() into the current directory.

    For each link, the ``Location`` header of the response is taken as the
    real file URL; the file is saved under that URL's text from character 56
    onward. Prints one line per file and a final total.
    """
    items, size = Get_Links()
    for item in items:
        redirect = requests.get(item)
        # NOTE(review): assumes the response exposes a Location header and
        # that the file name starts at offset 56 of it -- confirm.
        source = redirect.headers.get('Location')
        print(f"Saving File {source[56:]}")
        download = requests.get(source)
        with open(f"{source[56:]}", 'wb') as f:
            f.write(download.content)
    print(f"\nTotal File Downloaded is {size}")
Save_Items()