Python folder contents CSV writer

I'm trying to write a simple command-line Python script that scans the contents of a directory and generates a CSV, but I keep getting errors and I'm not sure what I'm doing wrong. Can someone point out my mistake?
import sys
import argparse
import os
import string
import fnmatch
import csv
from string import Template
from os import path
from os.path import basename

header = ["Title","VersionData","PathOnClient","OwnerId","FirstPublishLocationId","RecordTypeId","TagsCsv"]

if not sys.argv.len < 2:
    with open(sys.argv[1], 'w') as f:
        writer = csv.DictWriter(f, fieldnames=header, delimiter=',')
        writer.writeheader()
        if os.path.isdir(sys.argv[2]):
            for d in os.scandir(sys.argv[2]):
                row = Template('"$title","$path","$path"') #some default values in the template were omitted here
                writer.writerow(row.substitute(title=basename(d.path)), path=path.abspath(d.path))

Right off the bat, csvwriter.writerow(row) takes only one argument: the row itself. In your call the closing parenthesis is misplaced, so path=... gets passed to writerow() instead of to substitute().
Moreover, since you created a csv.DictWriter, writerow() expects a dictionary mapping field names to values, not a string built from a Template, so substituting into a Template and writing the result won't work.
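For illustration, a minimal sketch of what DictWriter.writerow() expects (the field names are a trimmed subset of the question's header, and "out.csv" is a placeholder filename):
import csv

header = ["Title", "PathOnClient"]  # trimmed subset of the question's header, for brevity

with open("out.csv", "w", newline="") as f:
    writer = csv.DictWriter(f, fieldnames=header)
    writer.writeheader()
    # writerow() takes exactly one argument: a dict keyed by the field names
    writer.writerow({"Title": "example", "PathOnClient": "/tmp/example"})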

Figured it out. For anyone else needing a quick CSV listing of folders, here's the code I got to work:
#!/usr/bin/env python3
import sys, os, csv
from string import Template
from pathlib import PurePath, PureWindowsPath
from os.path import basename

header = ["Title","Path","","","","",""] # insert what header you need, if any

if len(sys.argv) >= 3: # need both an output CSV path and a directory to scan
    with open(sys.argv[1], 'w', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=header, dialect='excel', delimiter=',', quoting=csv.QUOTE_ALL)
        writer.writeheader()
        initPath = os.path.abspath(sys.argv[2])
        if sys.platform.startswith(('linux', 'cygwin', 'darwin')):
            p = PurePath(initPath)
        elif sys.platform.startswith('win32'):
            p = PureWindowsPath(initPath)
        if os.path.isdir(str(p)) and not str(p).startswith('.'):
            for d in os.scandir(str(p)):
                #srow = Template('"$title","$path","","","",""') # earlier Template-based attempt, kept for reference
                #s = srow.substitute({'title': basename(d.path), 'path': os.path.abspath(d.path)})
                #print(s) # this is for testing if the content produces what's expected
                row = {'Title': basename(d.path), 'Path': os.path.abspath(d.path)} # DictWriter fills any missing header fields with its restval (empty string by default)
                writer.writerow(row)
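If you don't need the PurePath/PureWindowsPath juggling, a shorter sketch with pathlib.Path does the same job (the argument handling mirrors the script above; filenames are whatever you pass on the command line):
#!/usr/bin/env python3
import sys, csv
from pathlib import Path

if len(sys.argv) >= 3:
    target = Path(sys.argv[2]).resolve()
    with open(sys.argv[1], 'w', newline='') as f:
        writer = csv.writer(f, quoting=csv.QUOTE_ALL)
        writer.writerow(["Title", "Path"])
        if target.is_dir():
            for entry in target.iterdir():  # yields a Path object for each child
                writer.writerow([entry.name, str(entry)])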


How can I save some json files generated in a for loop as csv?

Sorry, I'm new to coding in Python. I need to save the JSON generated in each iteration of a for loop to its own CSV file.
I wrote code that works fine for generating the first CSV file, but then it gets overwritten on every iteration, and I haven't found a solution yet. Can anyone help me? Many thanks.
from twarc.client2 import Twarc2
import itertools
import pandas as pd
import csv
import json
import numpy as np

# Your bearer token here
t = Twarc2(bearer_token="AAAAAAAAAAAAAAAAAAAAA....WTW")

# Get a bunch of user handles you want to check:
list_of_names = np.loadtxt("usernames.txt", dtype="str")

# Get the `data` part of every request only, as one list
def get_data(results):
    return list(itertools.chain(*[result['data'] for result in results]))

user_objects = get_data(t.user_lookup(users=list_of_names, usernames=True))

for user in user_objects:
    following = get_data(t.following(user['id']))
    # Do something with the lists
    print(f"User: {user['username']} Follows {len(following)} -2")
    json_string = json.dumps(following)
    df = pd.read_json(json_string)
    df.to_csv('output_file.csv')
You need to add a sequence number or some other unique identifier to the filename. The clearest options are to keep track of a counter or to use a GUID. Below I've used a counter that is initialized before your loop and incremented in each iteration. This will produce files like output_file_0.csv, output_file_1.csv, output_file_2.csv, and so on.
counter = 0
for user in user_objects:
    following = get_data(t.following(user['id']))
    # Do something with the lists
    print(f"User: {user['username']} Follows {len(following)} -2")
    json_string = json.dumps(following)
    df = pd.read_json(json_string)
    df.to_csv('output_file_' + str(counter) + '.csv')
    counter += 1
We convert the integer to a string and paste it in between the name of your file and its extension. Alternatively, enumerate() can manage the counter for you:
from twarc.client2 import Twarc2
import itertools
import pandas as pd
import csv
import json
import numpy as np

# Your bearer token here
t = Twarc2(bearer_token="AAAAAAAAAAAAAAAAAAAAA....WTW")

# Get a bunch of user handles you want to check:
list_of_names = np.loadtxt("usernames.txt", dtype="str")

# Get the `data` part of every request only, as one list
def get_data(results):
    return list(itertools.chain(*[result['data'] for result in results]))

user_objects = get_data(t.user_lookup(users=list_of_names, usernames=True))

for idx, user in enumerate(user_objects):
    following = get_data(t.following(user['id']))
    # Do something with the lists
    print(f"User: {user['username']} Follows {len(following)} -2")
    json_string = json.dumps(following)
    df = pd.read_json(json_string)
    df.to_csv(f'output_file{idx}.csv')
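If numbered files are hard to tell apart later, a small variation is to name each file after the user instead (a sketch; it assumes the handles are filesystem-safe, so sanitize them first if they might not be):
for user in user_objects:
    following = get_data(t.following(user['id']))
    df = pd.read_json(json.dumps(following))
    # one CSV per user, named after the handle rather than a counter
    df.to_csv(f"following_{user['username']}.csv")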

Export JSON to CSV using Python

I wrote some code to extract information from a website. The output is JSON, and I want to export it to CSV, so I tried converting it to a pandas dataframe and then exporting that to CSV with pandas. I can print the results, but it still doesn't convert into a pandas dataframe. Do you know what the problem with my code is?
# -*- coding: utf-8 -*-
# To create http request/session
import requests
import re, urllib
import pandas as pd
from BeautifulSoup import BeautifulSoup

url = "https://www.indeed.com/jobs?q=construction%20manager&l=Houston&start=10"

# create session
s = requests.session()
html = s.get(url).text

# extract job IDs
job_ids = ','.join(re.findall(r"jobKeysWithInfo\['(.+?)'\]", html))
ajax_url = 'https://www.indeed.com/rpc/jobdescs?jks=' + urllib.quote(job_ids)

# do Ajax request and convert the response to json
ajax_content = s.get(ajax_url).json()
print(ajax_content)

# Convert to pandas dataframe
df = pd.read_json(ajax_content)

# Export to CSV
df.to_csv("c:\\users\\Name\\desktop\\newcsv.csv")
The error message is:
Traceback (most recent call last):
File "C:\Users\Mehrdad\Desktop\Indeed 06.py", line 21, in
df = pd.read_json(ajax_content)
File "c:\python27\lib\site-packages\pandas\io\json\json.py", line 408, in read_json
path_or_buf, encoding=encoding, compression=compression,
File "c:\python27\lib\site-packages\pandas\io\common.py", line 218, in get_filepath_or_buffer
raise ValueError(msg.format(_type=type(filepath_or_buffer)))
ValueError: Invalid file path or buffer object type:
The problem was that nothing was going into the dataframe when you called read_json(): it expects a JSON string or a file path, but ajax_content is an already-parsed, nested JSON dict:
import requests
import re, urllib
import pandas as pd
from pandas.io.json import json_normalize
url = "https://www.indeed.com/jobs?q=construction%20manager&l=Houston&start=10"
s = requests.session()
html = s.get(url).text
job_ids = ','.join(re.findall(r"jobKeysWithInfo\['(.+?)'\]", html))
ajax_url = 'https://www.indeed.com/rpc/jobdescs?jks=' + urllib.quote(job_ids)
ajax_content= s.get(ajax_url).json()
df = json_normalize(ajax_content).transpose()
df.to_csv('your_output_file.csv')
Note that I called json_normalize() to collapse the nested columns from the JSON. I also called transpose() so that the rows were labelled with the job ID rather than columns. This will give you a dataframe that looks like this:
0079ccae458b4dcf <p><b>Company Environment: </b></p><p>Planet F...
0c1ab61fe31a5c62 <p><b>Commercial Construction Project Manager<...
0feac44386ddcf99 <div><div>Trendmaker Homes is currently seekin...
...
It's not really clear what your expected output is, though: what are you expecting the DataFrame/CSV file to look like? If you were actually looking for just a single row/Series with the job IDs as column labels, just remove the call to transpose().
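One caveat worth noting: in pandas 1.0 and later, json_normalize was promoted to the top-level namespace, so on a modern install the import and call would look like this instead:
import pandas as pd

# pandas >= 1.0: json_normalize lives at the top level
df = pd.json_normalize(ajax_content).transpose()
df.to_csv('your_output_file.csv')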

How to remove unusual characters from JSON dump in Python?

I have been searching around for a good way to remove all unusual characters from a JSON dump of tweets that I am using to compile a dataset for sentiment analysis.
characters I am trying to remove = ンボ チョボ付 最安値
These characters appear in my tweet data and I am trying to remove them using regex but to no avail.
import json
import csv
import pandas as pd
import matplotlib.pyplot as plt

tweets_data_path = 'twitter_data.txt'
tweets_data = []
tweets_text_data = []

tweets_file = open(tweets_data_path, "r")
for line in tweets_file:
    try:
        tweet = json.loads(line)
        tweets_data.append(tweet)
    except:
        continue

for tweet in tweets_data:
    if tweet['text']:
        tweets_text_data.append(tweet['text'])

print(tweets_text_data)

with open('dataset_file', 'w') as dataset_file:
    writer = csv.writer(dataset_file)
    writer.writerow(tweets_text_data)
I tried using re.sub() to take away these characters, but it will not work. How can I make this work?
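For what it's worth, a common approach (a sketch, assuming you really do want to drop every non-ASCII character rather than just those specific ones) is either a regex over the ASCII range or an encode/decode round-trip:
import re

text = "prices ンボ チョボ付 最安値 here"

# Option 1: regex that removes everything outside the ASCII range
cleaned = re.sub(r'[^\x00-\x7F]+', '', text)

# Option 2: encode to ASCII, silently dropping unencodable characters
cleaned2 = text.encode('ascii', errors='ignore').decode('ascii')

print(cleaned)   # spaces between the removed runs remain
print(cleaned2)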

Python: How to save *.dat-files as *.csv-files to new folder

I have a folder with lots of *.dat files (which were created with the program IDL). I am able to take one single file, convert it to a *.csv file and save it in a different (already existing) folder:
import idlsave
import csv

input_file = idlsave.read("C:/Users/RAW/06211714.dat")
n = input_file["raw"]

with open("C:/Users/CSV/06211714.csv", "w", newline='') as f:
    writer = csv.writer(f)
    writer.writerows(n)
The line input_file = idlsave.read("C:/Users/RAW/06211714.dat") shows the following output:
Available variables: raw class ['numpy.recarray']
So, this works fine for just taking one file, but I am looking for a way to take all *.dat files at once and convert each of them to a *.csv file with their original name.
I was thinking of something like this, but it didn't work:
import glob

for filename in glob.glob("C:/Users/RAW/*.dat"):
    for element in filename:
        i = idlsave.read(element)
        n = i["raw"]
        with open("C:/Users/CSV/*.csv", "w", newline='') as f:
            writer = csv.writer(f)
            writer.writerows(n)
Can someone please give me some advice?
Thanks.
import csv
import idlsave
from os import listdir
from os.path import isfile, join, splitext

dat_folder = "/folder/to/dat/files/"
csv_folder = "/folder/to/save/new/csv/files/"

onlyfilenames = [f for f in listdir(dat_folder) if isfile(join(dat_folder, f))]
for fullfilename in onlyfilenames:
    file_name, file_extension = splitext(fullfilename)
    if file_extension == ".dat":
        input_file = idlsave.read(dat_folder + fullfilename)
        n = input_file["raw"]
        with open(join(csv_folder, file_name + ".csv"), "w", newline='') as f:
            writer = csv.writer(f)
            writer.writerows(n)
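Sticking closer to the glob approach from the question, here's a sketch of the same conversion (the folder paths are the question's placeholders). glob already yields full paths, so no inner loop is needed, and each output name is derived from its input name:
import csv
import glob
import idlsave
from os.path import basename, join, splitext

csv_folder = "C:/Users/CSV/"

for dat_path in glob.glob("C:/Users/RAW/*.dat"):
    n = idlsave.read(dat_path)["raw"]
    # reuse the .dat file's base name for the .csv output
    stem = splitext(basename(dat_path))[0]
    with open(join(csv_folder, stem + ".csv"), "w", newline='') as f:
        csv.writer(f).writerows(n)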

Python, UnicodeEncodeError

Hello, I've got this piece of code:
import urllib.request
import string
import time
import gzip
from io import BytesIO
from io import StringIO
from zipfile import ZipFile
import csv
import datetime
from datetime import date
import concurrent.futures

den = date.today().replace(day=1) - datetime.timedelta(days=1)
url = '' + den.strftime("%Y%m%d") + '_OB_ADR_csv.zip'
data = urllib.request.urlopen(url).read()
zipdata = BytesIO()
zipdata.write(data)

csvfile = open('./test.csv', 'w', newline='')
csvwrite = csv.writer(csvfile, delimiter=';')

with ZipFile(zipdata) as zip:
    for i, nazev in enumerate(zip.namelist()):
        if i == 0:
            continue
        csvstring = StringIO(str(zip.read(nazev), encoding='windows-1250'))
        csvreader = csv.reader(csvstring, delimiter=';')
        for j, row in enumerate(csvreader):
            if j == 0 and i != 1:
                continue
            csvwrite.writerow(row)

csvfile.close()
When I run it, it sometimes throws "UnicodeEncodeError: 'ascii' codec can't encode character '\xf3' in position 1: ordinal not in range(128)" at csvwrite.writerow(row).
How can I solve this issue? Thank you.
EDIT:
I run it under Python 3.3
You didn't specify an encoding when opening the output file. Take a look at the pydocs for the csv module:
To decode a file using a different encoding, use the encoding argument
of open...[t]he same applies to writing in something other than the
system default encoding: specify the encoding argument when opening
the output file.
You can see from the UnicodeEncodeError that Python thinks you want the file written in ascii. Just specify the encoding parameter and choose your desired encoding (my suggestion is encoding='utf-8').
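Concretely, for the script above that means a one-argument change to the open() call (utf-8 here is a suggestion, not a requirement; pick whatever encoding the consumers of test.csv expect):
# was: csvfile = open('./test.csv', 'w', newline='')
csvfile = open('./test.csv', 'w', newline='', encoding='utf-8')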