How to align the text of a table cell to center using python-docx?

I have searched a lot but am not able to get through it. If anybody has experience with python-docx, could you help me with a few things, as below?
My code is as follows:
from docx import Document
from docx.shared import Cm, Inches, Pt
from docx.enum.table import WD_TABLE_ALIGNMENT
from docx.enum.table import WD_ALIGN_VERTICAL
from docx.enum.section import WD_ORIENT
from docx.enum.style import WD_STYLE_TYPE
from docx.enum.text import WD_ALIGN_PARAGRAPH

chart = Document()
style = chart.styles['Normal']
font = style.font
font.size = Pt(12)
font.underline = False
# Table Starts
print("Table Making Starts..")
table = chart.add_table(rows=1, cols=2, style='Table Grid')
# Adding Table Heading Text
cell0 = table.cell(0, 0)
cell0.vertical_alignment = WD_ALIGN_VERTICAL.CENTER
cell0.bold = True
cell0.text = "Column1"
cell1 = table.cell(0, 1)
cell1.bold = True
cell1.text = "Column2"
cell1.vertical_alignment = WD_ALIGN_VERTICAL.CENTER
chart.save('./test7.docx')
But the problem is that it does not align the text to center.
My Code Output:
Expected Output:
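For reference (my addition, not an answer from the thread): in python-docx, horizontal alignment is a paragraph property and bold is a run property, so neither can be set on the cell itself; assigning cell0.bold silently does nothing. A minimal sketch of the usual fix:

from docx import Document
from docx.enum.table import WD_ALIGN_VERTICAL
from docx.enum.text import WD_ALIGN_PARAGRAPH

chart = Document()
table = chart.add_table(rows=1, cols=2, style='Table Grid')

cell0 = table.cell(0, 0)
cell0.vertical_alignment = WD_ALIGN_VERTICAL.CENTER   # vertical centering is a cell property
paragraph = cell0.paragraphs[0]                       # every cell starts with one empty paragraph
paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER       # horizontal centering is a paragraph property
run = paragraph.add_run("Column1")
run.bold = True                                       # bold is a run property, not a cell property

chart.save('test7.docx')

Note that assigning cell.text after setting the alignment would replace the paragraph's content, so the text is added as a run instead.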

Related

How to scrape only texts from specific HTML elements?

I have a problem with selecting the appropriate items from the list.
For example, I want to omit "1." and then the first "5" (as in the example).
Additionally, I would like to write a condition that the letter "W" should be changed to "WIN".
import re
from selenium import webdriver
from bs4 import BeautifulSoup as BS2
from time import sleep

driver = webdriver.Chrome()
driver.get("https://www.flashscore.pl/druzyna/ajax/8UOvIwnb/tabela/")
sleep(10)
page = driver.page_source
soup = BS2(page, 'html.parser')
content = soup.find('div', {'class': 'ui-table__body'})
content_list = content.find_all('span', {"table__cell table__cell--value"})
res = []
for i in content:
    line = i.text.split()[0]
    if re.search('Ajax', line):
        res.append(line)
print(res)
Results:
['1.Ajax550016:315?WWWWW']
I need:
Ajax;5;5;0;16;3;W;W;W;W;W
I would recommend selecting your elements more specifically:
for e in soup.select('.ui-table__row'):
Iterate the ResultSet and decompose() the unwanted tag:
e.select_one('.wld--tbd').decompose()
Extract the texts with stripped_strings and join() them into your expected string:
data.append(';'.join(e.stripped_strings))
Example
It also makes some replacements based on a dict, just to demonstrate how this would work, not knowing what R or P stand for.
...
soup = BS2(page, 'html.parser')

data = []
for e in soup.select('.ui-table__row'):
    e.select_one('.wld--tbd').decompose()
    e.select_one('.tableCellRank').decompose()
    e.select_one('.table__cell--points').decompose()
    e.select_one('.table__cell--score').string = ';'.join(e.select_one('.table__cell--score').text.split(':'))
    pattern = {'W': 'WIN', 'R': 'RRR', 'P': 'PPP'}
    data.append(';'.join([pattern.get(i, i) for i in e.stripped_strings]))
data
To get the result for Ajax only:
data = []
for e in soup.select('.ui-table__row:-soup-contains("Ajax")'):
    e.select_one('.wld--tbd').decompose()
    e.select_one('.tableCellRank').decompose()
    e.select_one('.table__cell--points').decompose()
    e.select_one('.table__cell--score').string = ';'.join(e.select_one('.table__cell--score').text.split(':'))
    pattern = {'W': 'WIN', 'R': 'RRR', 'P': 'PPP'}
    data.append(';'.join([pattern.get(i, i) for i in e.stripped_strings]))
data
Output
Based on current data, it may differ from the question's example.
['Ajax;6;6;0;0;21;3;WIN;WIN;WIN;WIN;WIN']
You had the right start by using bs4 to find the table div, but then you gave up and just tried to use re to extract from the text. As you can see, that's not going to work. Here is a simple way to hack together what you want: I keep grabbing divs from the table div you found, and then grab the text of the next eight divs after finding Ajax. Then I do some dirty string manipulation, because the WWWWW is all in the same top-level div.
import re
from selenium import webdriver
from bs4 import BeautifulSoup as BS2
from time import sleep
from webdriver_manager.chrome import ChromeDriverManager

driver = webdriver.Chrome(ChromeDriverManager().install())
#driver = webdriver.Chrome()
driver.get("https://www.flashscore.pl/druzyna/ajax/8UOvIwnb/tabela/")
driver.implicitly_wait(10)
page = driver.page_source
soup = BS2(page, 'html.parser')
content = soup.find('div', {'class': 'ui-table__body'})
content_list = content.find_all('span', {"table__cell table__cell--value"})
res = []
found = 0
for i in content.find('div'):
    line = i.text.split()[0]
    if re.search('Ajax', line):
        found = 8
    if found:
        found -= 1
        res.append(line)
# change field 5 into separate values and skip field 6
res = res[:4] + res[5].split(':') + res[7:]
# break the last field into separate values and drop the first '?'
res = res[:-1] + [i for i in res[-1]][1:]
print(";".join(res))
returns
Ajax;5;5;0;16;3;W;W;W;W;W
This works, but it is very brittle and will break as soon as the website changes its content; you should put in a lot of error checking. I also replaced the sleep with a wait call and added webdriver-manager, which lets me use Selenium with Chrome.
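As a sketch (my addition, reusing the class name the scraper above relies on), an explicit wait blocks until the results table is actually present instead of waiting a fixed time:

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# wait up to 10 seconds for the results table to appear, then continue immediately
WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.CLASS_NAME, "ui-table__body"))
)
page = driver.page_source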

Layout Parser: lp.draw_box not working in for loop

I am trying to visualize pages after passing them through Layout Parser, using lp.draw_box. It works fine for a single image, but if I run it in a for loop over a set of images, it doesn't print anything. Could someone please help me here? Thanks.
Code:
from pdf2image import convert_from_path, convert_from_bytes
import cv2, time

start = time.time()
images = convert_from_path(f"/content/neft-system-faq.pdf")
page_layout_map = {}
for idx, image in enumerate(images):
    image.save('out.png', 'PNG')
    image = cv2.imread("out.png")
    image = image[..., ::-1]
    layout = model.detect(image)
    lp.draw_box(image, layout, None, show_element_id=True)
    text_blocks = lp.Layout([b for b in layout])
    print(text_blocks)
    block_test = get_coordinates(text_blocks)
    result_arr = layout_map_processor(block_test)
    page_layout_map[idx] = result_arr
print(time.time()-start)
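A likely cause, assuming this runs in a Jupyter/Colab notebook: lp.draw_box returns the annotated image, and a notebook only auto-renders the value of the last expression in a cell, so inside a loop nothing gets shown. Displaying the result explicitly on each iteration should work; a minimal sketch:

from IPython.display import display  # assuming a notebook environment

for idx, image in enumerate(images):
    ...
    layout = model.detect(image)
    # display() forces the notebook to render the drawn boxes on every iteration
    display(lp.draw_box(image, layout, None, show_element_id=True))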

In Gtk, how to make a window smaller when creating

I am trying to display both an image and a box with an Entry widget. I can do that, but the window is so large that the widget at the bottom is mostly out of view. I have tried several calls to set the window's size or unmaximize it, but they seem to have no effect. I determined that the problem only occurs when the image is large, but I still wonder how to display a large image in a resizable window or, for that matter, how to make any changes to the window's geometry from code. All the function calls I tried seem to have no effect.
Here is my code:
import gi
gi.require_version("Gtk", "3.0")
from gi.repository import Gtk
from gi.repository import GdkPixbuf
from urllib.request import urlopen

class Display(object):
    def __init__(self):
        self.window = Gtk.Window()
        self.window.connect('destroy', self.destroy)
        self.window.set_border_width(10)
        # a box underneath would be added every time you do
        # vbox.pack_start(new_widget)
        vbox = Gtk.VBox()
        self.image = Gtk.Image()
        response = urlopen('http://1.bp.blogspot.com/-e-rzcjuCpk8/T3H-mSry7PI/AAAAAAAAOrc/Z3XrqSQNrSA/s1600/rubberDuck.jpg').read()
        pbuf = GdkPixbuf.PixbufLoader()
        pbuf.write(response)
        pbuf.close()
        self.image.set_from_pixbuf(pbuf.get_pixbuf())
        self.window.add(vbox)
        vbox.pack_start(self.image, False, False, 0)
        self.entry = Gtk.Entry()
        vbox.pack_start(self.entry, True, True, 0)
        self.image.show()
        self.window.show_all()

    def main(self):
        Gtk.main()

    def destroy(self, widget, data=None):
        Gtk.main_quit()

a = Display()
a.main()
Most of the posted information seems to pertain to Gtk2 rather than Gtk3, but there is a solution: use a PixbufLoader and set the size:
from gi.repository import Gtk, Gdk, GdkPixbuf
#more stuff
path = config['DEFAULT']['datasets']+'working.png'
with open(path, 'rb') as f:
    pixels = f.read()
loader = GdkPixbuf.PixbufLoader()
loader.set_size(400, 400)  # force the decoded pixbuf to 400x400
loader.write(pixels)
loader.close()             # close before get_pixbuf() so the pixbuf is fully loaded
self.main_image.set_from_pixbuf(loader.get_pixbuf())
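If the image is already on disk, a one-call alternative (a sketch, not from the original answer) is Pixbuf.new_from_file_at_size, which decodes the file scaled to fit the given box while preserving the aspect ratio:

# 'path' and 'self.main_image' as in the snippet above
pb = GdkPixbuf.Pixbuf.new_from_file_at_size(path, 400, 400)
self.main_image.set_from_pixbuf(pb)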

How to scrape the first n paragraphs from a url

I am very new to web-scraping, and I am really having a hard time with extracting some paragraphs from a url. From the following link I'm trying to print all the paragraphs under the Cover Page and Short Summary headers. But my program is not working.
Here is my code:
from bs4 import BeautifulSoup
from urllib.parse import urlparse
import requests
import bs4

url = 'http://onepiece.wikia.com/wiki/Chapter_863'
r = requests.get(url)
soup = BeautifulSoup(r.content, 'html.parser')
table = soup.find_all('div', attrs={"mw-content-ltr mw-content-text"})
for x in table:
    if (x.get(id) == "Cover Page"):
        print(x.get('p').get_text())
    elif (x.get(id) == "Short Summary"):
        print(x.get('p').get_text())
When I run my program, it does not print anything, nor does it show an error message. Is there any way I can print only the paragraphs under the Cover Page and Short Summary sections?
If we analyze the HTML source of the page, we can see what we need to get for the Cover Page and Short Summary:
Cover Page -> all "p" tags between the h2 Cover Page and the h2 Short Summary
Short Summary -> all "p" tags between the h2 Short Summary and the h2 Long Summary
In the code, we find all h2 and p tags and record the index of each h2 as a marker. Once we have our markers, we loop over the tree again and collect all the needed paragraphs between the h2 tags.
from bs4 import BeautifulSoup
from urllib.parse import urlparse
import requests
import bs4

url = 'http://onepiece.wikia.com/wiki/Chapter_863'
r = requests.get(url)
soup = BeautifulSoup(r.content, 'html.parser')
table = soup.find_all('div', attrs={"mw-content-ltr mw-content-text"})
for x in table:
    i = 0
    cover_page_mark = 0
    short_summary_mark = 0
    long_summary_mark = 0
    cover_page = ''
    short_summary = ''
    # first pass: record the index of each h2 heading as a marker
    for el in x.find_all(['h2', 'p']):
        if el.name == 'h2':
            if "Cover Page" in el.get_text():
                cover_page_mark = i
            if "Short Summary" in el.get_text():
                short_summary_mark = i
            if "Long Summary" in el.get_text():
                long_summary_mark = i
        i += 1
    # second pass: collect the paragraphs that sit between the markers
    i = 0
    for el in x.find_all(['h2', 'p']):
        if el.name == 'p':
            if cover_page_mark < i < short_summary_mark:
                cover_page += el.get_text()
            if short_summary_mark < i < long_summary_mark:
                short_summary += el.get_text()
        i += 1
    print(cover_page)
    print(short_summary)
To get the desired result while keeping your script concise, you can do something like this as well. Run it and see the magic.
import requests
from bs4 import BeautifulSoup

soup = BeautifulSoup(requests.get("http://onepiece.wikia.com/wiki/Chapter_863").text, "html.parser")
for item in soup.select("#mw-content-text"):
    required_data = [p_item.text.strip() for p_item in item.select("p")][1:4]
    print('\n'.join(required_data))

Python3.5 BeautifulSoup4 get text from 'p' in div

I am trying to pull all the text from the div class 'caselawcontent searchable-content'. This code just prints the HTML without the text from the web page. What am I missing to get the text?
The following link is in the 'filteredcasesdoc.txt' file:
http://caselaw.findlaw.com/mo-court-of-appeals/1021163.html
import requests
from bs4 import BeautifulSoup

with open('filteredcasesdoc.txt', 'r') as openfile1:
    for line in openfile1:
        rulingpage = requests.get(line).text
        soup = BeautifulSoup(rulingpage, 'html.parser')
        doctext = soup.find('div', class_='caselawcontent searchable-content')
        print(doctext)
from bs4 import BeautifulSoup
import requests
url = 'http://caselaw.findlaw.com/mo-court-of-appeals/1021163.html'
soup = BeautifulSoup(requests.get(url).text, 'html.parser')
I've used a much more reliable .find method (key: value):
whole_section = soup.find('div', {'class': 'caselawcontent searchable-content'})
the_title = whole_section.center.h2
#e.g. Missouri Court of Appeals,Southern District,Division Two.
second_title = whole_section.center.h3.p
#e.g. STATE of Missouri, Plaintiff-Appellant v....
number_text = whole_section.center.h3.next_sibling.next_sibling
#e.g.
the_date = number_text.next_sibling.next_sibling
#authors
authors = whole_section.center.next_sibling
para = whole_section.findAll('p')[1:]
#Because we don't want the paragraph h3.p.
#We could also do findAll('p', recursive=False), which doesn't pick up children.
Basically, I've dissected this whole tree. As for the paragraphs (e.g. the main text, the var para), you'll have to loop:
print(authors)
# add .text (e.g. print(authors.text)) to get the text without the tag,
# or use a simple function that returns only the text
def rettext(something):
    return something.text
#Usage: print(rettext(authors))
Try printing doctext.text. This will get rid of all the HTML tags for you.
import requests
from bs4 import BeautifulSoup

cases = []
with open('filteredcasesdoc.txt', 'r') as openfile1:
    for url in openfile1:
        # GET the HTML page as a string, with HTML tags
        rulingpage = requests.get(url).text
        soup = BeautifulSoup(rulingpage, 'html.parser')
        # find the part of the HTML page we want, as an HTML element
        doctext = soup.find('div', class_='caselawcontent searchable-content')
        print(doctext.text)  # now we have the inner text as a string
        cases.append(doctext.text)  # do something useful with this!