Python, UnicodeEncodeError - csv

Hello I've got this piece of code
import urllib.request
import string
import time
import gzip
from io import BytesIO
from io import StringIO
from zipfile import ZipFile
import csv
import datetime
from datetime import date
import concurrent.futures
den = date.today().replace(day=1) - datetime.timedelta(days=1)
url = '' + den.strftime("%Y%m%d") + '_OB_ADR_csv.zip'
data = urllib.request.urlopen(url).read()
zipdata = BytesIO()
zipdata.write(data)
csvfile = open('./test.csv', 'w', newline='')
csvwrite = csv.writer(csvfile, delimiter=';')
with ZipFile(zipdata) as zip:
for i, nazev in enumerate(zip.namelist()):
if i == 0:
continue
csvstring = StringIO(str(zip.read(nazev), encoding='windows-1250'))
csvreader = csv.reader(csvstring, delimiter=';')
for j, row in enumerate(csvreader):
if j == 0 and i != 1:
continue
csvwrite.writerow(row)
csvfile.close()
When i run it it sometimes throws "UnicodeEncodeError: 'ascii' codec can't encode character '\xf3' in position 1: ordinal not in range(128)" at "csvwrite.writerow(row)"
How can I solve this issue? Thank you.
EDIT:
I run it under Python 3.3

You didn't tell csv.writer about the encoding. Take a look at the pydocs for the csv module:
To decode a file using a different encoding, use the encoding argument
of open...[t]he same applies to writing in something other than the
system default encoding: specify the encoding argument when opening
the output file.
You can see from the UnicodeEncodeError that Python thinks you want the file written in ascii. Just specify the encoding parameter and choose your desired encoding (my suggestion is encoding='utf-8').

Related

Cannot read JSON with Pandas a file encoded in UCS-2 Little Endian

with open(filename+'.json') as json_file:
data=pd.io.json.read_json(json_file,encoding='utf_16_be')
I tried multiple options for encoding but it fails. It returns empty object. I can convert only when save my file in Notepad++ as UTF8 without BOM. I open it as normally with default encoding:
with open(filename+'.json') as json_file:
data=pd.io.json.read_json(json_file)
Default encoding of the file is UTC-2 Little Endian. How to read json with this encoding?
Read and follow import pandas as pd; help (pd.io.json.read_json). The following (partially commented) code snippet could help:
filename = r"D:\PShell\DataFiles\61571258" # my test case
import pandas as pd
filepath = filename + ".json"
# define encoding while opening a file
with open(filepath, encoding='utf-16') as f:
data = pd.io.json.read_json(f)
# or open file in binary mode and decode while converting to pandas object
with open(filepath, mode='rb') as f:
atad = pd.io.json.read_json(f, encoding='utf-16')
# ensure that both above methods are equivalent
print((data == atad).values)
Output: .\SO\69537408.py
[[ True True True True True True True]]

Unable to print output of JSON code into a .csv file

I'm getting the following errors when trying to decode this data, and the 2nd error after trying to compensate for the unicode error:
Error 1:
write.writerows(subjects)
UnicodeEncodeError: 'ascii' codec can't encode character u'\u201c' in position 160: ordinal not in range(128)
Error 2:
with open("data.csv", encode="utf-8", "w",) as writeFile:
SyntaxError: non-keyword arg after keyword arg
Code
import requests
import json
import csv
from bs4 import BeautifulSoup
import urllib
r = urllib.urlopen('https://thisiscriminal.com/wp-json/criminal/v1/episodes?posts=10000&page=1')
data = json.loads(r.read().decode('utf-8'))
subjects = []
for post in data['posts']:
subjects.append([post['title'], post['episodeNumber'],
post['audioSource'], post['image']['large'], post['excerpt']['long']])
with open("data.csv", encode="utf-8", "w",) as writeFile:
write = csv.writer(writeFile)
write.writerows(subjects)
Using requests and with the correction to the second part (as below) I have no problem running. I think your first problem is due to the second error (is a consequence of that being incorrect).
I am on Python3 and can run yours with my fix to open line and with
r = urllib.request.urlopen('https://thisiscriminal.com/wp-json/criminal/v1/episodes?posts=10000&page=1')
I personally would use requests.
import requests
import csv
data = requests.get('https://thisiscriminal.com/wp-json/criminal/v1/episodes?posts=10000&page=1').json()
subjects = []
for post in data['posts']:
subjects.append([post['title'], post['episodeNumber'],
post['audioSource'], post['image']['large'], post['excerpt']['long']])
with open("data.csv", encoding ="utf-8", mode = "w",) as writeFile:
write = csv.writer(writeFile)
write.writerows(subjects)
For your second, looking at documentation for open function, you need to use the right argument names and add the name of the mode argument if not positional matching.
with open("data.csv", encoding ="utf-8", mode = "w") as writeFile:

How to remove unusual characters from JSON dump in Python?

I have been searching around for a good way to remove all unusual characters from a JSON dump of tweets that I am using to compile a dataset for sentiment analysis.
characters I am trying to remove = ンボ チョボ付 最安値
These characters appear in my tweet data and I am trying to remove them using regex but to no avail.
import json
import csv
import pandas as pd
import matplotlib.pyplot as plt
tweets_data_path = 'twitter_data.txt'
tweets_data = []
tweets_text_data = []
tweets_file = open(tweets_data_path, "r")
for line in tweets_file:
try:
tweet = json.loads(line)
tweets_data.append(tweet)
except:
continue
for tweet in tweets_data:
if tweet['text']:
tweets_text_data.append(tweet['text'])
print(tweets_text_data)
with open('dataset_file', 'w') as dataset_file:
writer = csv.writer(dataset_file)
writer.writerow(tweets_text_data)
I tried using re.sub() to take away these charcters but it will not work. How can I make this work?

Can't load dataset into ipython. UnicodeDecodeError: 'utf-8' codec can't decode byte 0xcd in position 1: invalid continuation byte

Fairly new to using ipython so I'm still getting confused quite easily. Here is my code so far. After loading I have to display only the first 5 rows of the file.
# Import useful packages for data science
from IPython.display import display, HTML
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
# Load concerts.csv
path1 = 'C:\\Users\\Cathal\\Documents\\concerts.csv'
concerts = pd.read_csv(path1)
Thanks in advance for any help.
try
concerts = pd.read_csv(path1, encoding = 'utf8')
if that doesnt work try
concerts = pd.read_csv(path1, encoding = "ISO-8859-1")

Python folder contents CSV writer

I'm trying to make a simple command line script with Python code that generates a CSV when it scans the contents of a directory, but I'm not sure if I'm doing it correctly, cause I keep getting errors. Can someone tell me what the heck I'm doing wrong?
import sys
import argparse
import os
import string
import fnmatch
import csv
from string import Template
from os import path
from os.path import basename
header = ["Title","VersionData","PathOnClient","OwnerId","FirstPublishLocationId","RecordTypeId","TagsCsv"]
if not sys.argv.len < 2:
with open(sys.argv[1], 'w') as f:
writer = csv.DictWriter(f, fieldnames = header, delimiter=',')
writer.writeheader()
if os.path.isdir(sys.argv[2]):
for d in os.scandir(sys.argv[2]):
row = Template('"$title","$path","$path"') #some default values in the template were omitted here
writer.writerow(row.substitute(title=basename(d.path)), path=path.abspath(d.path))
Right off the bat, csvwriter.writerow(row) takes only one argument. You need to wrap your arguments inside brackets and then join with comma.
Moreover, you cannot call other functions within the row object, which is what you are trying to do with row.substitute(args) etc.
Figured it out. For anyone else needing a quick CSV listing of folders, here's the code I got to work:
#!/usr/bin/env python3
import sys, os, csv
from string import Template
from pathlib import PurePath, PureWindowsPath
from os.path import basename
header = ["Title","Path","","","","",""] # insert what header you need, if any
if not len(sys.argv) < 2:
with open(sys.argv[1], 'w') as f:
writer = csv.DictWriter(f, fieldnames=header, dialect='excel', delimiter=',', quoting=csv.QUOTE_ALL)
writer.writeheader()
initPath = os.path.abspath(sys.argv[2])
if sys.platform.startswith('linux') or sys.platform.startswith('cygwin') or sys.platform.startswith('darwin'):
p = PurePath(initPath)
else:
if sys.platform.startswith('win32'):
p = PureWindowsPath(initPath)
if os.path.isdir(str(p)) and not str(p).startswith('.'):
for d in os.scandir(str(p)):
srow = Template('"$title","$path", "","","",""')
#s = srow.substitute({'title': basename(d.path), 'path': os.path.abspath(d.path)) #
#print(s) # this is for testing if the content produces what's expected
row = {'Title': basename(d.path), 'Path': os.path.abspath(d.path)} # the dictionary must have the same number of entries as the number of header fields your CSV is going to contain.
writer.writerow(row)