How to take output of nltk.book, common_contexts function to a variable - nltk

common_contexts in nltk.book returns NoneType and hence how is it possible to store its output to a variable
from nltk.book import *
wtc=text1.common_contexts(['you'])
print(wtc)
print(type(wtc))
wtc variable above will return NONE.

What I tried was to write the ouput of common_context to a file ans then read from it using splitlines
from nltk.book import *
import contextlib
fnallst=[]
for w in set(text2): ## we choose text2 as text2 has lesser distinct word
filename = "C:\\Users\\username\Log"
with open(filename,'w') as f:
with contextlib.redirect_stdout(f):
text2.common_contexts([w])
with open(filename) as f:
wct2 = f.read().splitlines()

Related

How can I save some json files generated in a for loop as csv?

Sorry, I am new in coding in Python, I would need to save a json file generated in a for loop as csv for each iteration of the loop.
I wrote a code that works fine to generate the first csv file but then it is overwritten and I did not find a solution yet. Can anyone help me? many thanks
from twarc.client2 import Twarc2
import itertools
import pandas as pd
import csv
import json
import numpy as np
# Your bearer token here
t = Twarc2(bearer_token="AAAAAAAAAAAAAAAAAAAAA....WTW")
# Get a bunch of user handles you want to check:
list_of_names = np.loadtxt("usernames.txt",dtype="str")
# Get the `data` part of every request only, as one list
def get_data(results):
return list(itertools.chain(*[result['data'] for result in results]))
user_objects = get_data(t.user_lookup(users=list_of_names, usernames=True))
for user in user_objects:
following = get_data(t.following(user['id']))
# Do something with the lists
print(f"User: {user['username']} Follows {len(following)} -2")
json_string = json.dumps(following)
df = pd.read_json(json_string)
df.to_csv('output_file.csv')
You need to add a sequence number or some other unique identifier to the filename. The clearest example would be to keep track of a counter, or use a GUID. Below I've used a counter that is initialized before your loop, and is incremented in each iteration. This will produce a list of files like output_file_1.csv, output_file_2.csv, output_file_3.csv and so on.
counter = 0
for user in user_objects:
following = get_data(t.following(user['id']))
# Do something with the lists
print(f"User: {user['username']} Follows {len(following)} -2")
json_string = json.dumps(following)
df = pd.read_json(json_string)
df.to_csv('output_file_' + str(counter) + '.csv')
counter += 1
We convert the integer to a string, and paste it inbetween the name of your file and its extension.
from twarc.client2 import Twarc2
import itertools
import pandas as pd
import csv
import json
import numpy as np
# Your bearer token here
t = Twarc2(bearer_token="AAAAAAAAAAAAAAAAAAAAA....WTW")
# Get a bunch of user handles you want to check:
list_of_names = np.loadtxt("usernames.txt",dtype="str")
# Get the `data` part of every request only, as one list
def get_data(results):
return list(itertools.chain(*[result['data'] for result in results]))
user_objects = get_data(t.user_lookup(users=list_of_names, usernames=True))
for idx, user in enumerate(user_objects):
following = get_data(t.following(user['id']))
# Do something with the lists
print(f"User: {user['username']} Follows {len(following)} -2")
json_string = json.dumps(following)
df = pd.read_json(json_string)
df.to_csv(f'output_file{str(idx)}.csv')

TypeError: document must be an instance of dict, bson.son.SON, bson.raw_bson.RawBSONDocument, a type that inherits from collections.MutableMapping

I am trying to write data into pymongo and this the TypeError that I am getting. The Type for mydict1 is List. Do I have to convert my data into json or bson before I write it to pymongo? Kindly help.
Thanks.
from numpy.polynomial import Polynomial as poly
import numpy as np
import matplotlib.pyplot as plt
import pymongo
import json
import pandas as pd
df = pd.read_csv(r'D:\polynomial\points.csv')
print(df)
x= np.array(df['Wavelength(A)'].tolist())
x= np.divide([299792.458], x)
y= np.array(df['Level(A)'].tolist())
x_trimmed = np.delete(x, np.where(y < 1e-4))
y_trimmed = np.delete(y, np.where(y < 1e-4))
test= poly.fit(x_trimmed, y_trimmed, 10)
print (test)
list1= test.convert().coef
print (list1)
print (len(list1))
#print (type(list1))
to_list= list1.tolist()
#print(to_list)
#data_format= json.dumps(to_list)
l = len(to_list)
#print (l)
mydict1= []
for i in range(l):
mydict = { "a"+str(i) : to_list[i] }
mydict1.append(mydict)
print (mydict1)
myclient = pymongo.MongoClient("mongodb://localhost:27017/")
mydb = myclient["mydatabase"]
mycol = mydb["coefficients"]
x = mycol.insert_one(mydict1)
This is mydict1=
[{'a0': -2.3373800910827825e+34}, {'a1': 1.2084654060419298e+33}, {'a2': -2.811587585787653e+31}, {'a3': 3.876370042231405e+29}, {'a4': -3.507261557232249e+27}, {'a5': 2.1759768836934694e+25}, {'a6': -9.37514311649608e+22}, {'a7': 2.7697765301392782e+20}, {'a8': -5.370081422614614e+17}, {'a9': 616983041924503.2}, {'a10': -318990754999.1472}]
The problem is that MongoDB's insert_one method inserts a single document that is represented by a dictionary, not a list.
The possible solutions are:
use insert_many instead. In this case, you will have every list item as a separate mongodb document
make a dict with your list values. You can use something like {"items": mydict1}, or reduce(lambda x, y: x | y, mydict1) depending on the document structure that will be better for your needs

How to handle newlines when loading a CSV into Apache Beam?

I am running into an issue where in some of my fields, there are new lines within the text.
My current code is as follows:
# Python's regular expression library
import re
import sys
# Beam and interactive Beam imports
import apache_beam as beam
from apache_beam.runners.interactive.interactive_runner import InteractiveRunner
import apache_beam.runners.interactive.interactive_beam as ib
p = beam.Pipeline(InteractiveRunner())
def print_row(element):
print(element)
def parse_file(element):
for line in csv.reader([element], quotechar='"', delimiter=',', lineterminator='\n', quoting=csv.QUOTE_ALL, skipinitialspace=True):
return line
parsed_csv = p | 'Read input file' >> beam.io.ReadFromText("gs://ny-data/AB_NYC_2019.csv")| 'Parse file' >> beam.Map(parse_file)
split = parsed_csv | beam.Map(lambda x: x[0]) | beam.Map(print)
p.run()
I am running into issues because some of the text appears as so:
The BLUE OWL:
VEGETARIAN WBURG W PATIO & BACKYARD!
Any thoughts on how to proceed?
ReadFromText reads inputs one line at a time. As suggested before, you can use the Dataframe read_csv, or you could create a PCollection of paths and open/read them in a DoFn.
For example, you could write
def read_csv_file(file_metadata):
with beam.io.filesystems.FileSystems.open(file_metadata.path) as fin:
for row in csv.reader(fin):
yield row
rows = (
p
| beam.io.fileio.MatchFiles('/pattern/to/files/*.csv') # emits FileMetadatas
| beam.FlatMap(read_csv_file)) # emits rows

How to import csv file data in aerospike without using aerospike loader?Is there any alternative available for aerospike loader?

I want to import following csv file data in aerospike and want to fire simple select query to display data using python as a client
e.g
policyID,statecode,county,eq_site_limit,hu_site_limit,fl_site_limit,fr_site_limit,tiv_2011,tiv_2012,eq_site_deductible,hu_site_deductible,fl_site_deductible,fr_site_deductible,point_latitude,point_longitude,line,construction,point_granularity
119736,FL,CLAY COUNTY,498960,498960,498960,498960,498960,792148.9,0,9979.2,0,0,30.102261,-81.711777,Residential,Masonry,1
448094,FL,CLAY COUNTY,1322376.3,1322376.3,1322376.3,1322376.3,1322376.3,1438163.57,0,0,0,0,30.063936,-81.707664,Residential,Masonry,3
query = client.query( 'test', 'csvfile' )
query.select( 'policyID', 'statecode' )
You could try to use the python csv module along with Aerospike Python client:
https://docs.python.org/2/library/csv.html
http://www.aerospike.com/docs/client/python/
And do something similar to the following:
import aerospike
import sys
import csv
global rec
rec = {}
csvfile = open('aerospike.csv', "rb")
reader = csv.reader(csvfile)
rownum = 0
for row in reader:
# Save First Row with headers
if rownum == 0:
header = row
else:
colnum = 0
for col in row:
# print (rownum,header[colnum],col)
rec[header[colnum]] = col
colnum += 1
rownum += 1
# print(rownum,rec)
if rec:
client.put(('test', 'demo', str(rownum)), rec)
rec = {}
csvfile.close()
Note: You may need to check the size of your headers and make sure they do not exceed 14 characters.
if not you could get the following:
error: (21L, 'A bin name should not exceed 14 characters limit', 'src/main/conversions.c', 500)
as far as I am aware there is no pre-built tool other than the loader that allows you to import CSV. You could, perhaps, build one using the existing client tools.

Python folder contents CSV writer

I'm trying to make a simple command line script with Python code that generates a CSV when it scans the contents of a directory, but I'm not sure if I'm doing it correctly, cause I keep getting errors. Can someone tell me what the heck I'm doing wrong?
import sys
import argparse
import os
import string
import fnmatch
import csv
from string import Template
from os import path
from os.path import basename
header = ["Title","VersionData","PathOnClient","OwnerId","FirstPublishLocationId","RecordTypeId","TagsCsv"]
if not sys.argv.len < 2:
with open(sys.argv[1], 'w') as f:
writer = csv.DictWriter(f, fieldnames = header, delimiter=',')
writer.writeheader()
if os.path.isdir(sys.argv[2]):
for d in os.scandir(sys.argv[2]):
row = Template('"$title","$path","$path"') #some default values in the template were omitted here
writer.writerow(row.substitute(title=basename(d.path)), path=path.abspath(d.path))
Right off the bat, csvwriter.writerow(row) takes only one argument. You need to wrap your arguments inside brackets and then join with comma.
Moreover, you cannot call other functions within the row object, which is what you are trying to do with row.substitute(args) etc.
Figured it out. For anyone else needing a quick CSV listing of folders, here's the code I got to work:
#!/usr/bin/env python3
import sys, os, csv
from string import Template
from pathlib import PurePath, PureWindowsPath
from os.path import basename
header = ["Title","Path","","","","",""] # insert what header you need, if any
if not len(sys.argv) < 2:
with open(sys.argv[1], 'w') as f:
writer = csv.DictWriter(f, fieldnames=header, dialect='excel', delimiter=',', quoting=csv.QUOTE_ALL)
writer.writeheader()
initPath = os.path.abspath(sys.argv[2])
if sys.platform.startswith('linux') or sys.platform.startswith('cygwin') or sys.platform.startswith('darwin'):
p = PurePath(initPath)
else:
if sys.platform.startswith('win32'):
p = PureWindowsPath(initPath)
if os.path.isdir(str(p)) and not str(p).startswith('.'):
for d in os.scandir(str(p)):
srow = Template('"$title","$path", "","","",""')
#s = srow.substitute({'title': basename(d.path), 'path': os.path.abspath(d.path)) #
#print(s) # this is for testing if the content produces what's expected
row = {'Title': basename(d.path), 'Path': os.path.abspath(d.path)} # the dictionary must have the same number of entries as the number of header fields your CSV is going to contain.
writer.writerow(row)