Writing Encoded JSON data to a csv using tweepy

I've been stuck on this one for a while. Right now this function writes the date, latitude, longitude, user id, and text of a live tweet to a csv file.
The problem is that the text of the tweet often contains letters from other alphabets, e.g. Arabic. These letters show up as escape sequences like \u0641\u064a.
Is it possible to encode the text as a UTF-8 string and append it to the rest of the data, so that the csv file correctly displays all characters?
def on_data(self, data):
    try:
        tweets = json.loads(data)
        with open('Data.csv', 'a', encoding='utf-8') as f:
            if tweets['coordinates'] is not None:
                coordinates_string = json.dumps(tweets["coordinates"]["coordinates"])
                val_lg = coordinates_string.split(',')[0].strip("[")
                val_lt = coordinates_string.split(',')[1].strip("]")
            else:
                val_lg = "None"
                val_lt = "None"
            text = json.dumps(tweets["text"])
            user_id = json.dumps(tweets["user"]["id_str"])
            time = json.dumps(tweets["created_at"])
            data_string = time + "," + val_lt + "," + val_lg + "," + user_id + "," + text + "\n"
            print(data_string)
            f.write(data_string)
    except:
        pass

You've got some overuse of json here. Once you json.loads the tweet, group the data fields in a list and use the csv module to write them out nicely.
import json
import csv

# A guess at the data format of the tweet that was parsable by the OP's original code.
D = {'coordinates': {'coordinates': [45.6, 122.3]}, 'text': u'some text\u0641\u064a',
     'user': {'id_str': 'some id'}, 'created_at': 'some date'}
data = json.dumps(D)

tweets = json.loads(data)

# 'utf-8-sig' makes sure the output csv will open in Excel if that is a goal.
# newline='' is a requirement for csv.writer in Python 3.
with open('Data.csv', 'a', encoding='utf-8-sig', newline='') as f:
    # This forces quoting of strings like the OP got from json.dumps.
    w = csv.writer(f, quoting=csv.QUOTE_NONNUMERIC)
    if tweets['coordinates'] is not None:
        val_lg = tweets['coordinates']['coordinates'][1]
        val_lt = tweets['coordinates']['coordinates'][0]
    else:
        val_lg = "None"
        val_lt = "None"
    text = tweets["text"]
    user_id = tweets["user"]["id_str"]
    time = tweets["created_at"]
    # Group the fields in a list for writerow.
    data = [time, val_lt, val_lg, user_id, text]
    print(data)
    w.writerow(data)
Output (UTF-8 terminal):
['some date', 45.6, 122.3, 'some id', 'some textفي']
Output (Data.csv):
"some date",45.6,122.3,"some id","some textفي"

Related

Loop through list of dictionaries and append to csv

I'm currently trying to collect tweets with the Twitter API. I want to merge two lists of dictionaries into a csv. The ['data'] list consists of ID and tweet; the second list, ['includes']['users'], consists of username and location. I tried two nested for loops to merge these elements, one over ['data'] and one over ['includes']['users'], but I end up with the exact same tweet and ID for every user in my csv output.
print(json.dumps(json_response, indent=4, sort_keys=True))
My data looks like this (not real tweets):
{"data": [{"author_id": "1234","id": "9999","text": "This is tweet number 1"},{"author_id": "9876","id": "1111","text": "This is another tweet"},],"includes": {"users": [{"id": "9999","location": "Earth","name": "George Huston","username": "George_Huston"},{"id": "1111","name": "Adam Sandler,"username": "adam_sandler"}]
json_response['includes']['users']
[{'name': 'George Huston', 'location': 'Earth', 'id': '9876', 'username': 'George_Huston'}, {'name': 'Adam Sandler', 'id': '9999', 'username': 'adam_sandler'}]
Creating a csv:
# Create file
csvFile = open("data.csv", "a", newline="", encoding='utf-8')
csvWriter = csv.writer(csvFile)
# Create headers for the data you want to save; in this example, we only want to save these columns in our dataset
csvWriter.writerow(['id', 'username', 'text', 'location'])
csvFile.close()
def append_to_csv(json_response, fileName):
    # A counter variable
    counter = 0
    # Open OR create the target CSV file
    csvFile = open(fileName, "a", newline="", encoding='utf-8')
    csvWriter = csv.writer(csvFile)
    # Loop through each tweet
    for tweet in json_response['data']:
        tweet_id = tweet['id']
        text = tweet['text']
        for element in json_response['includes']['users']:
            username = element['username']
            if 'location' in tweet:
                location = element['location']
            else:
                location = " "
            # Assemble all data in a list
            res = [tweet_id, username, text, location]
            # Append the result to the CSV file
            csvWriter.writerow(res)
            counter += 1
    # When done, close the CSV file
    csvFile.close()
    # Print the number of tweets for this iteration
    print("# of Tweets added from this response: ", counter)

append_to_csv(json_response, "data.csv")
But I get this csv output:
id,username,text,location
9999,George_Huston,"This is tweet number 1",
9999,adam_sandler,"This is tweet number 1",
The id, text, location is always the same, while the username is different. How can I solve this problem?
In your for tweet in json_response['data'] loop you overwrite tweet_id and text as the loop goes on. The output you see is whatever they were set to in the last iteration of the loop.
It seems from the Twitter API that you can get usernames from the Tweet Objects as well, without the json_response['includes']['users'] lookup that you used.
Does this do what you want?
# Create file
fileName = 'data.csv'
csvFile = open(fileName, "w", newline="", encoding='utf-8')
csvWriter = csv.writer(csvFile)
# Create headers for the data you want to save; in this example, we only want to save these columns in our dataset
csvWriter.writerow(['id', 'username', 'text', 'location'])
csvFile.close()

def append_to_csv(json_response, fileName):
    # A counter variable
    counter = 0
    # Open OR create the target CSV file
    csvFile = open(fileName, "a", newline="", encoding='utf-8')
    csvWriter = csv.writer(csvFile)
    # Loop through each tweet
    for tweet in json_response['data']:
        tweet_id = tweet['id']
        text = tweet['text']
        username = tweet['username']
        if 'location' in tweet:
            location = tweet['location']
        else:
            location = " "
        # Assemble all data in a list
        res = [tweet_id, username, text, location]
        # Append the result to the CSV file
        csvWriter.writerow(res)
        counter += 1
    # When done, close the CSV file
    csvFile.close()
    # Print the number of tweets for this iteration
    print("# of Tweets added from this response: ", counter)

append_to_csv(json_response, "data.csv")
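If you do need the location from json_response['includes']['users'], an alternative to nesting the loops is to build a lookup table keyed on user id and join each tweet to its user. A sketch, assuming (as in the sample data above) that each user object's id matches a tweet's id:

# Build the user lookup once, then join each tweet to one user.
users_by_id = {user['id']: user for user in json_response['includes']['users']}
for tweet in json_response['data']:
    user = users_by_id.get(tweet['id'], {})
    res = [tweet['id'], user.get('username', ''), tweet['text'], user.get('location', ' ')]
    csvWriter.writerow(res)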

Instaloader JSON files: Convert 200 JSON files into a Single CSV (Python 3.7)

I want to automatically download pictures (or videos) along with their captions and other data from a specific Instagram hashtag (e.g. #moodoftheday) using Instaloader. Instaloader returns JSON files that include post metadata.
The following code worked with metadata from a single #user_profile.
I want to do the same, but for a #hashtag rather than a specific #user.
The ultimate goal is to get all of the JSON files (e.g. 200) into a single csv file.
How can I process my downloaded data into a clean Excel/CSV file?
Here is my code:
# Install Instaloader
import instaloader

def get_instagram_posts(username, startdate, enddate):
    # Create an instaloader object with parameters
    L = instaloader.Instaloader(download_pictures=False, download_videos=False, download_comments=False, compress_json=False)
    # Log in with the instaloader object
    L.login("username", "password")
    # Search the instagram profile
    profile = instaloader.Profile.from_username(L.context, username)
    # Scrape the posts
    posts = profile.get_posts()
    for post in takewhile(lambda p: p.date > startdate, dropwhile(lambda p: p.date > enddate, posts)):
        print(post.date)
        L.download_post(post, target=profile.username)

'''
This function will now save all instagram posts and related data to a folder in your current working directory.
Let's call this function on the instagram account of "moodoftheday" and let the script do its magic.
This might take a while so be patient.
'''
import os
import datetime

# instagram username
username = "realdonaldtrump"
# daterange of scraping
startdate = datetime(2020, 9, 1)
enddate = datetime(2020, 10, 1)
# get your current working directory
current_wkdir = os.get_cwd()
# Call the function. This will automatically store all the scraped data in a folder in your current working directory
get_instagram_posts(username, startdate, enddate)
'''
You notice that this data is NOT yet in the right format since each post has a separate json file.
You will need to process all these json files to a consolidated excel file in order to perform analyses on the data.
'''
def parse_instafiles(username, path):
    """
    This function loads in all the json files generated by the instaloader package and parses them into a csv file.
    """
    # print('Entering provided directory...')
    os.chdir(os.path.join(path, username))
    columns = ['filename', 'datetime', 'type', 'locations_id', 'locations_name', 'mentions', 'hashtags', 'video_duration']
    dataframe = pd.DataFrame(columns=[])
    # print('Traversing file tree...')
    for file in glob('*UTC.json'):
        with open(file, 'r') as filecontent:
            filename = filecontent.name
            # print('Found JSON file: ' + filename + '. Loading...')
            try:
                metadata = orjson.loads(filecontent.read())
            except IOError as e:
                # print("I/O Error. Couldn't load file. Trying the next one...")
                continue
            # print('Collecting relevant metadata...')
            time = datetime.fromtimestamp(int(metadata['node']['taken_at_timestamp']))
            type_ = metadata['node']['__typename']
            likes = metadata['node']['edge_media_preview_like']['count']
            comments = metadata['node']['edge_media_to_comment']['count']
            username = metadata['node']['owner']['username']
            followers = metadata['node']['owner']['edge_followed_by']['count']
            try:
                text = metadata['node']['edge_media_to_caption']['edges'][0]['node']['text']
            except:
                text = ""
            try:
                post_id = metadata['node']['id']
            except:
                post_id = ""
            minedata = {'filename': filename, 'time': time, 'text': text,
                        'likes': likes, 'comments': comments, 'username': username, 'followers': followers, 'post_id': post_id}
            # print('Writing to dataframe...')
            dataframe = dataframe.append(minedata, ignore_index=True)
            # print('Closing file...')
            del metadata
    # print('Storing dataframe to CSV file...')
    # print('Done.')
    dataframe['source'] = 'Instagram'
    return dataframe
'''
You can then use this function to process the "moodoftheday" Instagram data.
'''
df_instagram = parse_instafiles(username, os.getcwd())
df_instagram.to_excel("moodoftheday.csv")
I am very new to Python and programming overall, therefore any help is very much appreciated!!
Thank you in advance! Sofia
I made some changes; it no longer shows errors, but it still needs some professional work:
import instaloader
import datetime
from itertools import takewhile
from itertools import dropwhile
import os
import glob as glob
import json
import pandas as pd
import csv

lusername = ''
lpassword = ''

def get_instagram_posts(username, startdate, enddate):
    # Create an instaloader object with parameters
    L = instaloader.Instaloader(download_pictures=False, download_videos=False, download_comments=False, compress_json=False)
    # Log in with the instaloader object
    L.login(lusername, lpassword)
    # Search the instagram profile
    profile = instaloader.Profile.from_username(L.context, username)
    # Scrape the posts
    posts = profile.get_posts()
    for post in takewhile(lambda p: p.date > startdate, dropwhile(lambda p: p.date > enddate, posts)):
        print(post.date)
        L.download_post(post, target=profile.username)

# instagram username
username = "realdonaldtrump"
# daterange of scraping
startdate = datetime.datetime(2020, 9, 1, 0, 0)
enddate = datetime.datetime(2022, 2, 1, 0, 0)
# get your current working directory
current_wkdir = os.getcwd()
# Call the function. This will automatically store all the scraped data in a folder in your current working directory
get_instagram_posts(username, startdate, enddate)

def parse_instafiles(username, path):
    # print('Entering provided directory...')
    os.chdir(os.path.join(path, username))
    columns = ['filename', 'datetime', 'type', 'locations_id', 'locations_name', 'mentions', 'hashtags', 'video_duration']
    dataframe = pd.DataFrame(columns=[])
    # print('Traversing file tree...')
    for file in glob.glob('*UTC.json'):
        with open(file, 'r') as filecontent:
            filename = filecontent.name
            # print('Found JSON file: ' + filename + '. Loading...')
            try:
                metadata = json.load(filecontent)
            except IOError as e:
                # print("I/O Error. Couldn't load file. Trying the next one...")
                continue
            # print('Collecting relevant metadata...')
            time = datetime.datetime.fromtimestamp(int(metadata['node']['taken_at_timestamp']))
            type_ = metadata['node']['__typename']
            likes = metadata['node']['edge_media_preview_like']['count']
            comments = metadata['node']['edge_media_to_comment']['count']
            username = metadata['node']['owner']['username']
            followers = metadata['node']['owner']['edge_followed_by']['count']
            try:
                text = metadata['node']['edge_media_to_caption']['edges'][0]['node']['text']
            except:
                text = ""
            try:
                post_id = metadata['node']['id']
            except:
                post_id = ""
            minedata = {'filename': filename, 'time': time, 'text': text,
                        'likes': likes, 'comments': comments, 'username': username, 'followers': followers, 'post_id': post_id}
            # print('Writing to dataframe...')
            dataframe = dataframe.append(minedata, ignore_index=True)
            # print('Closing file...')
            del metadata
    # print('Storing dataframe to CSV file...')
    # print('Done.')
    dataframe['source'] = 'Instagram'
    return dataframe

'''
You can then use this function to process the "moodoftheday" Instagram data.
'''
df_instagram = parse_instafiles(username, os.getcwd())
df_instagram.to_csv("moodoftheday.csv")
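One caveat about the code above, not part of the original post: pandas deprecated DataFrame.append in 1.4 and removed it in 2.0, so dataframe = dataframe.append(minedata, ignore_index=True) fails on current pandas. A forward-compatible sketch is to collect the row dicts in a plain list and build the DataFrame once after the loop:

import pandas as pd

rows = []
for minedata in ({'likes': 1}, {'likes': 2}):  # stand-in for the per-file dicts built above
    rows.append(minedata)
dataframe = pd.DataFrame(rows)  # build the frame once instead of appending row by row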
Instaloader has an example of hashtag search in its documentation; here's the code:
from datetime import datetime
import instaloader

L = instaloader.Instaloader()
posts = instaloader.Hashtag.from_name(L.context, "urbanphotography").get_posts()

SINCE = datetime(2020, 5, 10)  # further from today, inclusive
UNTIL = datetime(2020, 5, 11)  # closer to today, not inclusive

k = 0  # initiate k
# k_list = []  # uncomment this to tune k

for post in posts:
    postdate = post.date
    if postdate > UNTIL:
        continue
    elif postdate <= SINCE:
        k += 1
        if k == 50:
            break
        else:
            continue
    else:
        L.download_post(post, "#urbanphotography")
        # if you want to tune k, uncomment below to get your k max
        # k_list.append(k)
        k = 0  # set k to 0

# max(k_list)
Here's the link for more info:
https://instaloader.github.io/codesnippets.html
I'm trying to do something similar, but I'm still very new to programming, so I'm sorry if I can't offer much help.

Twitter streaming script is throwing a keyerror on location field of the tweet

I have written a Python script to stream tweets, using the tweepy module. After streaming for around 3 minutes, I dump the tweets into a .json file. I then try to populate a pandas dataframe with the location and text fields of each tweet. The text field gets populated, but not for every tweet (problem 1), and as far as the location field is concerned, a KeyError is thrown (problem 2). May I know what exactly is going wrong?
twitter_stream_dump.py
import time
import json
import pandas as pd
import re
# tweepy based modules
import tweepy
from tweepy import OAuthHandler
from tweepy import Stream
from tweepy.streaming import StreamListener

# initializing authentication credentials
consumer_key = ''
consumer_secret = ''
access_key = ''
access_secret = ''

# This is a basic listener that just prints received tweets to stdout.
class StdOutListener(StreamListener):

    def __init__(self, time_limit):
        self.start_time = time.time()
        self.limit = time_limit
        self.saveFile = open('requests.json', 'a')
        super(StdOutListener, self).__init__()

    def on_data(self, data):
        if (time.time() - self.start_time) < self.limit:
            self.saveFile.write(data)
            self.saveFile.write('\n')
            return True
        else:
            self.saveFile.close()
            return False

    def on_error(self, status):
        print(status)

def getwords(string):
    return re.findall(r"[\w'#]+|[.,!?;]", string)

if __name__ == '__main__':
    # This handles Twitter authentication and the connection to the Twitter Streaming API
    auth = OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_key, access_secret)
    time_limit = input("Enter the time limit in minutes : ")
    time_limit *= 60
    stream = Stream(auth, listener=StdOutListener(time_limit))
    string = raw_input("Enter the list of keywords/hashtags to be compared : ")
    keyword_list = getwords(string)
    # This line filters Twitter Streams to capture data by the keywords: 'python', 'javascript', 'ruby'
    stream.filter(track=keyword_list)
    tweets_data_path = 'requests.json'
    tweets_data = []
    tweet_list = []
    tweets_file = open(tweets_data_path, "r")
    for line in tweets_file:
        try:
            tweet = json.loads(line)
            tweet_list.append(tweet)
        except:
            continue
    num_tweets_collected = len(tweet_list)
    # Creates a data frame structure
    tweet_dataframe = pd.DataFrame()
    text_dump = open('text_dump.txt', 'w')
    # Populating the location field of the data frame
    # tweet_dataframe['location'] = map(lambda tweet: tweet['location'], tweet_list)
    tweet_dataframe['text'] = map(lambda tweet: tweet['text'], tweet_list)
    print(tweet_dataframe['text'])
Errors:
abhijeet-mohanty-2:Desktop SubrataMohanty$ python twitter_stream_dump.py
Enter the time limit in minutes : 3
Enter the list of keywords/hashtags to be compared : python ruby scala
Traceback (most recent call last):
File "twitter_stream_dump.py", line 81, in <module>
tweet_dataframe['location'] = map(lambda tweet : tweet['location'], tweet_list)
File "twitter_stream_dump.py", line 81, in <lambda>
tweet_dataframe['location'] = map(lambda tweet : tweet['location'], tweet_list)
KeyError: 'location'
requests.json (My .json file)
https://drive.google.com/file/d/0B1p05OszaBkXLWFsQ2VmeWVjbDQ/view?usp=sharing
The location field is a user-defined value and will sometimes not be present.
That's why you're getting the KeyError.
Note that location is part of the "user profile" metadata that comes with a tweet. It's intended to describe a user's location (like their hometown), not the geotagged location of a given tweet.
In case you're interested in geotags, first check a tweet to see if the geo_enabled field is true. If so, the geo, coordinates, and place fields may contain geotagged information.
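For example (a sketch, not code from the question), guarding the lookups with dict.get avoids the KeyError for tweets that lack these fields:

# 'location' lives on the user profile; geotags (if any) live on the tweet itself.
locations = [tweet.get('user', {}).get('location') for tweet in tweet_list]
geotags = [tweet.get('coordinates') for tweet in tweet_list]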
As for missing text entries, I don't see the same issue when using the data you provided. It's possible the issue was caused by your try/except clause when reading in lines of data. Consider this approach:
for i, line in enumerate(tweets_file):
    if line.rstrip():
        tweet = json.loads(line)
        tweet_list.append(tweet)

num_tweets_collected = len(tweet_list)
texts = [tweet['text'] for tweet in tweet_list]
tweet_dataframe = pd.DataFrame(texts, columns=['text'])
Sample output:
print(tweet_dataframe.head())
# text
# 0 Tweets and python BFF <3 15121629.976126991
# 1 RT #zeroSteiner: Can now write more post modul...
# 2 •ruby• #MtvInstagLSelena #MtvColabTaylors
# 3 Ruby Necklace July Birthstone Jewelry Rosary...
# 4 #ossia I didn't see any such thing as Python. ...
A few quick summary stats show that no lines are missing, and no entries are null:
print("N tweets: {}".format(num_tweets_collected))
# N tweets: 286
print("N rows in dataframe: {}".format(tweet_dataframe.shape[0]))
# N rows in dataframe: 286
null_count = tweet_dataframe.text.isnull().sum()
print("Tweets with no text field extracted: {}".format(null_count))
# Tweets with no text field extracted: 0

Python Replace throwing errors when replacing "</html>"

I am very new to Python and I'm trying to understand and use the script from this link in Anaconda running on Python 3.5.2. I have had to change some things so that the script can run in this version of Python since it is from 2013. The script (as amended by inexperienced me) is as follows and my problem is in the try block in the line html = f.read().replace("</html>", "") + "</html>".
I simply cannot understand the reason for the + "</html>" that comes after the closing parenthesis. From what I have found out about the replace() method, it takes at least two parameters: the old character(s) and the new ones. As it is, this script jumps to the except Exception as e: and prints a bytes-like object is required, not 'str'.
Now this is, as far as I can tell, because the reading is done as bytes whereas the replace method takes strings. I tried to divide the line into:
html = f.read
html = str.replace("</html>", "") + "</html>"
but this throws replace() takes at least 2 arguments (1 given). I also tried changing the contents of html from bytes to str as follows:
html = str(f.read(), 'utf-8')
html = str.replace("</html>", "")
but this also returns the error that replace() takes two arguments (1 given). When I removed the html = str.replace("</html>", "") + "</html>" line altogether and skipped to soup = BeautifulSoup(html), I ended up with a warning that no parser was explicitly specified, and later an AttributeError that a NoneType object has no attribute get_dictionary.
Any help regarding the need for the mentioned line, why it is used, and how to use it would be greatly appreciated. Thank you.
#!/usr/bin/python
import sys
import urllib.request
import re
import json
from bs4 import BeautifulSoup
import socket

socket.setdefaulttimeout(10)

cache = {}
for line in open(sys.argv[1]):
    fields = line.rstrip('\n').split('\t')
    sid = fields[0]
    uid = fields[1]
    # url = 'http://twitter.com/%s/status/%s' % (uid, sid)
    # print url
    tweet = None
    text = "Not Available"
    if sid in cache:
        text = cache[sid]
    else:
        try:
            f = urllib.request.urlopen("http://twitter.com/%s/status/%s" % (uid, sid))
            print('URL: ', f.geturl())
            # Thanks to Arturo!
            # html = f.read()
            html = f.read().replace("</html>", "") + "</html>"
            soup = BeautifulSoup(html)
            jstt = soup.find_all("p", "js-tweet-text")
            tweets = list(set([x.get_text() for x in jstt]))
            # print len(tweets)
            # print tweets
            if len(tweets) > 1:
                continue
            text = tweets[0]
            cache[sid] = tweets[0]
            for j in soup.find_all("input", "json-data", id="init-data"):
                js = json.loads(j['value'])
                if js.has_key("embedData"):
                    tweet = js["embedData"]["status"]
                    text = js["embedData"]["status"]["text"]
                    cache[sid] = text
                    break
        except Exception as e:
            print(e)
            continue
    if tweet is not None and tweet["id_str"] != sid:
        text = "Not Available"
        cache[sid] = "Not Available"
    text = text.replace('\n', ' ')
    text = re.sub(r'\s+', ' ', text)
    # print json.dumps(tweet, indent=2)
    print("\t".join(fields + [text]).encode('utf-8'))
str.replace here is being used in its unbound form: calling the method on the type str instead of on a str instance.
Used this way, str.replace actually needs 3 arguments: the string to act on, the char or string to replace, and the new char or string.
'abcd'.replace('d', 'z') is equivalent to str.replace('abcd', 'd', 'z'):
print('abcd'.replace('d', 'z'))
# abcz
print(str.replace('abcd', 'd', 'z'))
# abcz
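The root cause in the OP's script, though, is that urlopen(...).read() returns bytes in Python 3, so calling .replace() with str arguments on it raises the a bytes-like object is required, not 'str' error. A minimal sketch of the fix (the URL is a placeholder) is to decode to str before using any str methods:

import urllib.request

f = urllib.request.urlopen("http://example.com")  # placeholder URL
html = f.read().decode('utf-8')  # bytes -> str
html = html.replace("</html>", "") + "</html>"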
I have accepted the solution kindly given by @DeepSpace as an answer, as it helped me realise how to overcome the problem I was facing. The code below can now execute under Python 3 if run from the command prompt as follows (please note that I executed this from the Windows command prompt):
python download_tweets.py input_file.tsv > output_file.tsv
The code follows:
#!/usr/bin/python
import sys
import urllib.request
import re
import json
from bs4 import BeautifulSoup
import socket

socket.setdefaulttimeout(10)

cache = {}
for line in open(sys.argv[1]):
    fields = line.rstrip('\n').split('\t')
    sid = fields[0]
    uid = fields[1]
    tweet = None
    text = "Not Available"
    if sid in cache:
        text = cache[sid]
    else:
        try:
            f = urllib.request.urlopen("http://twitter.com/%s/status/%s" % (uid, sid))
            # print('URL: ', f.geturl())
            # Thanks to Arturo!
            html = str.replace(str(f.read(), 'utf-8'), "</html>", "")
            # html = f.read().replace("</html>", "") + "</html>"  # original line
            soup = BeautifulSoup(html, "lxml")  # added "lxml" as it was giving warnings
            jstt = soup.find_all("p", "js-tweet-text")
            tweets = list(set([x.get_text() for x in jstt]))
            # print(len(tweets))
            if len(tweets) > 1:
                continue
            text = tweets[0]
            cache[sid] = tweets[0]
            for j in soup.find_all("input", "json-data", id="init-data"):
                js = json.loads(j['value'])
                if "embedData" in js:
                    # if js.has_key("embedData"):  # original line
                    tweet = js["embedData"]["status"]
                    text = js["embedData"]["status"]["text"]
                    cache[sid] = text
                    break
        except Exception as e:
            print(e)
            continue
    if tweet is not None and tweet["id_str"] != sid:
        text = "Not Available"
        cache[sid] = "Not Available"
    text = text.replace('\n', ' ')
    text = re.sub(r'\s+', ' ', text)
    # print(json.dumps("dump: ", tweet, indent=2))
    print(" \t ".join(fields + [text]).encode('utf-8'))

UnicodeDecodeError: 'charmap' codec can't decode byte 0x8d in position 7240: character maps to <undefined>

I am a student doing my master's thesis. As part of my thesis, I am working with Python. I am reading a log file in .csv format and writing the extracted data to another .csv file in a well-formatted way. However, when the file is read, I get this error:
Traceback (most recent call last):
  File "C:\Users\SGADI\workspace\DAB_Trace\my_code\trace_parcer.py", line 19, in <module>
    for row in reader:
  File "C:\Users\SGADI\Desktop\Python-32bit-3.4.3.2\python-3.4.3\lib\encodings\cp1252.py", line 23, in decode
    return codecs.charmap_decode(input,self.errors,decoding_table)[0]
UnicodeDecodeError: 'charmap' codec can't decode byte 0x8d in position 7240: character maps to <undefined>
import csv
import re
# import matplotlib
# import matplotlib.pyplot as plt
import datetime
# import pandas
# from dateutil.parser import parse

# def parse_csv_file():
timestamp = datetime.datetime.strptime('00:00:00.000', '%H:%M:%S.%f')
timestamp_list = []
snr_list = []
freq_list = []
rssi_list = []
dab_present_list = []
counter = 0

f = open("output.txt", "w")
with open('test_log_20150325_gps.csv') as csvfile:
    reader = csv.reader(csvfile, delimiter=';')
    for row in reader:
        # timestamp = datetime.datetime.strptime(row[0], '%M:%S.%f')
        # timestamp.split(" ", 1)
        timestamp = row[0]
        timestamp_list.append(timestamp)
        details = row[-1]
        counter += 1
        print(counter)
        # if counter > 25000:
        #     break
        # search for SNRLevel=\d+
        snr = re.findall('SNRLevel=(\d+)', details)
        if snr == []:
            snr = 0
        else:
            snr = snr[0]
        snr_list.append(int(snr))
        # search for Frequency=09ABC
        freq = re.findall('Frequency=([0-9a-fA-F]+)', details)
        if freq == []:
            freq = 0
        else:
            freq = int(freq[0], 16)
        freq_list.append(int(freq))
        # search for RSSI=\d+
        rssi = re.findall('RSSI=(\d+)', details)
        if rssi == []:
            rssi = 0
        else:
            rssi = rssi[0]
        rssi_list.append(int(rssi))
        # search for DABSignalPresent=\d+
        dab_present = re.findall('DABSignalPresent=(\d+)', details)
        if dab_present == []:
            dab_present = 0
        else:
            dab_present = dab_present[0]
        dab_present_list.append(int(dab_present))
        f.write(str(timestamp) + "\t")
        f.write(str(freq) + "\t")
        f.write(str(snr) + "\t")
        f.write(str(rssi) + "\t")
        f.write(str(dab_present) + "\n")
        print(timestamp, freq, snr, rssi, dab_present)
        '''if snr != []:
            if freq != []:
                timestamp_list.append(timestamp)
                snr_list.append(snr)
                freq_list.append(freq)
                f.write(str(timestamp_list) + "\t")
                f.write(str(freq_list) + "\t")
                f.write(str(snr_list) + "\n")
                print(timestamp_list, freq_list, snr_list)'''
f.close()
I searched for the special character and did not find any. I searched the Internet, which suggested changing the format: I tried utf8, latin1 and a few other encodings, but I am still getting this error. Can you please help me solve this with pandas as well? I also tried pandas and I still get the error.
I even removed a line in the log file, but the error occurs on the next line.
Please help me find a solution, thank you.
I have solved this issue.
We can use this code:
import codecs

types_of_encoding = ["utf8", "cp1252"]
for encoding_type in types_of_encoding:
    with codecs.open(filename, encoding=encoding_type, errors='replace') as csvfile:
        your code
        ....
        ....
I have solved this issue by simply adding a parameter in open():
with open(filename, encoding='cp850') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
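The OP also asked how to handle this with pandas. read_csv accepts the same encoding argument (a sketch, assuming the semicolon delimiter from the question and the cp1252 guess from the traceback; on pandas >= 1.3, undecodable bytes can additionally be relaxed with encoding_errors='replace'):

import pandas as pd

df = pd.read_csv('test_log_20150325_gps.csv', sep=';', encoding='cp1252')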
with open('input.tsv', 'rb') as f:
    for ln in f:
        decoded = False
        line = ''
        # note: 'utf-8' and 'utf8' are aliases for the same codec
        for cp in ('cp1252', 'cp850', 'utf-8', 'utf8'):
            try:
                line = ln.decode(cp)
                decoded = True
                break
            except UnicodeDecodeError:
                pass
        if decoded:
            pass  # use 'line' here