I have such a dataset. User can choose certain time interval via slider to adjust time interval in minutes. I would like display the size of dataset at the beginning of web page. If user changes the time interval, the size of dataset should change dynamically as well as showing how many datapoints are removed dynamically.
import random
import pandas as pd
import numpy as np
import plotly.express as px
from plotly.subplots import make_subplots
from dash import Dash, html, dcc, Input, Output,dash_table
df = pd.DataFrame({'DATE_TIME':pd.date_range('2022-11-01', '2022-11-06 23:00:00',freq='20min'),
'ID':[random.randrange(1, 3) for n in range(430)]})
df['VALUE1'] = [random.uniform(110, 160) for n in range(430)]
df['VALUE2'] = [random.uniform(50, 80) for n in range(430)]
df['INSPECTION'] = df['DATE_TIME'].dt.day
df['MODE'] = np.select([df['INSPECTION']==1, df['INSPECTION'].isin([2,3])], ['A', 'B'], 'C')
df['TIME'] = df['DATE_TIME'].dt.time
df['TIME'] = df['TIME'].astype('str')
df['TIMEINTERVAL'] = df.DATE_TIME.diff().astype('timedelta64[m]')
df['TIMEINTERVAL'] = df['TIMEINTERVAL'].fillna(0)
def to_day_period(s):
bins = ['0', '06:00:00', '13:00:00', '18:00:00', '23:00:00', '24:00:00']
labels = ['Nighttime', 'Daytime', 'Daytime', 'Nighttime', 'Nighttime']
return pd.cut(
pd.to_timedelta(s),
bins=list(map(pd.Timedelta, bins)),
labels=labels, right=False, ordered=False
)
df['TIME_OF_DAY'] = to_day_period(df['TIME'])
app = Dash(__name__)
app.layout = html.Div([
html.H4('Interactive Scatter Plot'),
html.P("Select ID:"),
dcc.Dropdown(df.ID.unique(), id='pandas-dropdown-1'), # for choosing ID,
html.P("Filter by time interval during nighttime (18:00-6:00):"),
dcc.RangeSlider(
id='range-slider-night',
min=0, max=600, step=10,
marks={0: '0', 50: '50', 100: '100', 150: '150', 200: '200', 250: '250', 300: '300', 350: '350', 400: '400',
450: '450', 500: '500', 550: '550', 600: '600'},
value=[0, 600]
),
html.P("Filter by time interval during daytime (6:00-18:00):"),
dcc.RangeSlider(
id='range-slider-morning',
min=0, max=600, step=10,
marks={0: '0', 50: '50', 100: '100', 150: '150', 200: '200', 250: '250', 300: '300', 350: '350', 400: '400',
450: '450', 500: '500', 550: '550', 600: '600'},
value=[0, 600]
),
dcc.Graph(id="scatter-plot", style={'width': '130vh', 'height': '80vh'}),
html.Div(id='dd-output-container')
])
#app.callback(
Output("scatter-plot", "figure"),
Input("pandas-dropdown-1", "value"),
Input("range-slider-night", "value"),
Input("range-slider-morning", "value"),
prevent_initial_call=True)
def update_lineplot(value, slider_range_night, slider_range_morning):
low_night, high_night = slider_range_night
low_morning, high_morning = slider_range_morning
df_night = df.query("ID == #value & TIME_OF_DAY == 'Night' & TIMEINTERVAL >= #low_night & TIMEINTERVAL < #high_night").copy()
df_morning = df.query("ID == #value & TIME_OF_DAY == 'Morning' & TIMEINTERVAL >= #low_morning & TIMEINTERVAL < #high_morning").copy()
df1 = pd.concat([df_night, df_morning], axis=0).sort_values(['DATE_TIME'])
if df1.shape[0] != 0:
fig = px.line(df1, x="DATE_TIME", y=["VALUE1", "VALUE2"],
facet_col='INSPECTION',
facet_col_wrap=2,
symbol='MODE',hover_data=['TIMEINTERVAL'],
facet_row_spacing=0.1,
facet_col_spacing=0.09)
fig.update_xaxes(matches=None, showticklabels=True)
return fig
else:
return no_update
app.run_server(debug=True, use_reloader=False)
I know that I can do such a task in Flask like in this link. But how can I adapt this solution in Dash?
Please, try it out and tell me if that what you are trying to do:
import random
import pandas as pd
import numpy as np
import plotly.express as px
from plotly.subplots import make_subplots
from dash import Dash, html, dcc, Input, Output,dash_table, no_update
df = pd.DataFrame({'DATE_TIME':pd.date_range('2022-11-01', '2022-11-06 23:00:00',freq='20min'),
'ID':[random.randrange(1, 3) for n in range(430)]})
df['VALUE1'] = [random.uniform(110, 160) for n in range(430)]
df['VALUE2'] = [random.uniform(50, 80) for n in range(430)]
df['INSPECTION'] = df['DATE_TIME'].dt.day
df['MODE'] = np.select([df['INSPECTION']==1, df['INSPECTION'].isin([2,3])], ['A', 'B'], 'C')
df['TIME'] = df['DATE_TIME'].dt.time
df['TIME'] = df['TIME'].astype('str')
df['TIMEINTERVAL'] = df.DATE_TIME.diff().astype('timedelta64[m]')
df['TIMEINTERVAL'] = df['TIMEINTERVAL'].fillna(0)
def to_day_period(s):
bins = ['0', '06:00:00', '13:00:00', '18:00:00', '23:00:00', '24:00:00']
labels = ['Nighttime', 'Daytime', 'Daytime', 'Nighttime', 'Nighttime']
return pd.cut(
pd.to_timedelta(s),
bins=list(map(pd.Timedelta, bins)),
labels=labels, right=False, ordered=False
)
df['TIME_OF_DAY'] = to_day_period(df['TIME'])
app = Dash(__name__)
app.layout = html.Div([
html.H4('Interactive Scatter Plot'),
html.Div('Size of dataset = '+str(len(df)),id='size',style={'whiteSpace': 'pre-wrap'}),
html.P("Select ID:"),
dcc.Dropdown(df.ID.unique(), id='pandas-dropdown-1'), # for choosing ID,
html.P("Filter by time interval during nighttime (18:00-6:00):"),
dcc.RangeSlider(
id='range-slider-night',
min=0, max=600, step=10,
marks={0: '0', 50: '50', 100: '100', 150: '150', 200: '200', 250: '250', 300: '300', 350: '350', 400: '400',
450: '450', 500: '500', 550: '550', 600: '600'},
value=[0, 600]
),
html.P("Filter by time interval during daytime (6:00-18:00):"),
dcc.RangeSlider(
id='range-slider-morning',
min=0, max=600, step=10,
marks={0: '0', 50: '50', 100: '100', 150: '150', 200: '200', 250: '250', 300: '300', 350: '350', 400: '400',
450: '450', 500: '500', 550: '550', 600: '600'},
value=[0, 600]
),
dcc.Graph(id="scatter-plot", style={'width': '130vh', 'height': '80vh'}),
html.Div(id='dd-output-container')
])
#app.callback(
Output("scatter-plot", "figure"),
Output("size", "children"),
Input("pandas-dropdown-1", "value"),
Input("range-slider-night", "value"),
Input("range-slider-morning", "value"),
prevent_initial_call=True)
def update_lineplot(value, slider_range_night, slider_range_morning):
low_night, high_night = slider_range_night
low_morning, high_morning = slider_range_morning
df_night = df.query("(ID == #value) & (TIME_OF_DAY == 'Nighttime') & (TIMEINTERVAL >= #low_night) & (TIMEINTERVAL < #high_night)").copy()
df_morning = df.query("(ID == #value) & (TIME_OF_DAY == 'Daytime') & (TIMEINTERVAL >= #low_morning) & (TIMEINTERVAL < #high_morning)").copy()
df1 = pd.concat([df_night, df_morning], axis=0).sort_values(['DATE_TIME'])
text = f"The size of dataset = {len(df)} \n Selected points = {len(df1)}, unselected points = {len(df) - len(df1)}, \n df_night = {len(df_night)}, df_morning = {len(df_morning)}"
if df1.shape[0] != 0:
fig = px.line(df1, x="DATE_TIME", y=["VALUE1", "VALUE2"],
facet_col='INSPECTION',
facet_col_wrap=2,
symbol='MODE',hover_data=['TIMEINTERVAL'],
facet_row_spacing=0.1,
facet_col_spacing=0.09)
fig.update_xaxes(matches=None, showticklabels=True)
return fig,text
else:
return no_update
app.run_server(debug=True, use_reloader=False)
Output:
You will find here how I calculate the sizes ;
text = f"The size of dataset = {len(df)} \n Selected points = {len(df1)}, unselected points = {len(df) - len(df1)}, \n df_night = {len(df_night)}, df_morning = {len(df_morning)}"
Related
I'm writting a Model to perform a classification on images for a school project.
I've 10 classes and I load images in batch on my model :
import torch
import torch.nn as nn
import torch.nn.functional as F
# *****START CODE
class ConvNet(nn.Module):
def __init__(self, in_ch, out_ch):
super(ConvNet, self).__init__()
"""
Number of layers should be exactly same as in the provided JSON.
Do not use any grouping function like Sequential
"""
self.Layer_001 = nn.Conv2d(in_channels=in_ch, out_channels=64, kernel_size=3, padding=1)
self.Layer_002 = nn.ReLU()
self.Layer_003 = nn.MaxPool2d(kernel_size=2,stride=2)
self.Layer_004 = nn.Conv2d(in_channels=64, out_channels=113, kernel_size=3, padding=1)
self.Layer_005 = nn.ReLU()
self.Layer_006 = nn.MaxPool2d(kernel_size=2,stride=2)
self.Layer_007 = nn.Conv2d(in_channels=113, out_channels=248, kernel_size=3, padding=1)
self.Layer_008 = nn.ReLU()
self.Layer_009 = nn.Conv2d(in_channels=248, out_channels=248, kernel_size=3, padding=1)
self.Layer_010 = nn.ReLU()
self.Layer_011 = nn.MaxPool2d(kernel_size=2,stride=2)
self.Layer_012 = nn.Conv2d(in_channels=248, out_channels=519, kernel_size=3, padding=1)
self.Layer_013 = nn.ReLU()
self.Layer_014 = nn.Conv2d(in_channels=519, out_channels=519, kernel_size=3, padding=1)
self.Layer_015 = nn.ReLU()
self.Layer_016 = nn.MaxPool2d(kernel_size=2,stride=2)
self.Layer_017 = nn.Conv2d(in_channels=519, out_channels=519, kernel_size=3, padding=1)
self.Layer_018 = nn.ReLU()
self.Layer_019 = nn.Conv2d(in_channels=519, out_channels=519, kernel_size=3, padding=1)
self.Layer_020 = nn.ReLU()
self.Layer_021 = nn.MaxPool2d(kernel_size=2,stride=2)
self.Layer_022 = nn.AdaptiveAvgPool2d((1,1))
self.Layer_023 = nn.Dropout(p=0.501816987002085)
self.Layer_024 = nn.Linear(in_features=519,out_features=2317)
self.Layer_025 = nn.ReLU()
self.Layer_026 = nn.Linear(in_features=2317, out_features=3018)
self.Layer_027 = nn.Linear(in_features=3018, out_features=3888)
self.Layer_028 = nn.ReLU()
self.Layer_029 = nn.Linear(in_features=3888, out_features=out_ch)
def forward(self, x):
x = self.Layer_001(x)
#print(x.shape)
x = self.Layer_002(x)
#print(x.shape)
x = self.Layer_003(x)
#print(x.shape)
x = self.Layer_004(x)
#print(x.shape)
x = self.Layer_005(x)
#print(x.shape)
x = self.Layer_006(x)
#print(x.shape)
x = self.Layer_007(x)
#print(x.shape)
x = self.Layer_008(x)
#print(x.shape)
x = self.Layer_009(x)
#print(x.shape)
x = self.Layer_009(x)
#print(x.shape)
x = self.Layer_010(x)
#print(x.shape)
x = self.Layer_011(x)
#print(x.shape)
x = self.Layer_012(x)
#print(x.shape)
x = self.Layer_013(x)
#print(x.shape)
x = self.Layer_014(x)
#print(x.shape)
x = self.Layer_015(x)
#print(x.shape)
x = self.Layer_016(x)
#print(x.shape)
x = self.Layer_017(x)
#print(x.shape)
x = self.Layer_018(x)
#print(x.shape)
x = self.Layer_019(x)
#print(x.shape)
x = self.Layer_020(x)
#print(x.shape)
x = self.Layer_021(x)
#print(x.shape)
x = self.Layer_022(x)
#print(x.shape)
x = self.Layer_023(x)
#print(x.shape)
#x = nn.Flatten(x)
##print(x.shape)
x = self.Layer_024(x)
#print(x.shape)
x = self.Layer_025(x)
#print(x.shape)
x = self.Layer_026(x)
#print(x.shape)
x = self.Layer_027(x)
#print(x.shape)
x = self.Layer_028(x)
#print(x.shape)
output = self.Layer_029(x)
print(output.shape)
return output
# *****END CODE
And when I run it I have an error with between layers
it return this error :
RuntimeError: mat1 and mat2 shapes cannot be multiplied (8304x1 and 519x2317)
I understand it a shape problem but I'm learning and don't understand where it's happening...
I'm trying to rebuild this architecture:
'Layer_001': {'input': 3,
'kernel_size': 3,
'output': 64,
'padding': 1,
'type': 'Conv2d'},
'Layer_002': {'type': 'ReLU'},
'Layer_003': {'kernel_size': 2, 'stride': 2, 'type': 'MaxPool2d'},
'Layer_004': {'input': 64,
'kernel_size': 3,
'output': 113,
'padding': 1,
'type': 'Conv2d'},
'Layer_005': {'type': 'ReLU'},
'Layer_006': {'kernel_size': 2, 'stride': 2, 'type': 'MaxPool2d'},
'Layer_007': {'input': 113,
'kernel_size': 3,
'output': 248,
'padding': 1,
'type': 'Conv2d'},
'Layer_008': {'type': 'ReLU'},
'Layer_009': {'input': 248,
'kernel_size': 3,
'output': 248,
'padding': 1,
'type': 'Conv2d'},
'Layer_010': {'type': 'ReLU'},
'Layer_011': {'kernel_size': 2, 'stride': 2, 'type': 'MaxPool2d'},
'Layer_012': {'input': 248,
'kernel_size': 3,
'output': 519,
'padding': 1,
'type': 'Conv2d'},
'Layer_013': {'type': 'ReLU'},
'Layer_014': {'input': 519,
'kernel_size': 3,
'output': 519,
'padding': 1,
'type': 'Conv2d'},
'Layer_015': {'type': 'ReLU'},
'Layer_016': {'kernel_size': 2, 'stride': 2, 'type': 'MaxPool2d'},
'Layer_017': {'input': 519,
'kernel_size': 3,
'output': 519,
'padding': 1,
'type': 'Conv2d'},
'Layer_018': {'type': 'ReLU'},
'Layer_019': {'input': 519,
'kernel_size': 3,
'output': 519,
'padding': 1,
'type': 'Conv2d'},
'Layer_020': {'type': 'ReLU'},
'Layer_021': {'kernel_size': 2, 'stride': 2, 'type': 'MaxPool2d'},
'Layer_022': {'output': 'COMPUTE', 'type': 'AdaptiveAvgPool2d'},
'Layer_023': {'p': 0.501816987002085, 'type': 'Dropout'},
'Layer_024': {'input': 'COMPUTE', 'output': 2317, 'type': 'Linear'},
'Layer_025': {'type': 'ReLU'},
'Layer_026': {'input': 2317, 'output': 'COMPUTE', 'type': 'Linear'},
'Layer_027': {'input': 3018, 'output': 3888, 'type': 'Linear'},
'Layer_028': {'type': 'ReLU'},
'Layer_029': {'input': 3888, 'output': 'COMPUTE', 'type': 'Linear'}
I think my error come from 'Layer_022': {'output': 'COMPUTE', 'type': 'AdaptiveAvgPool2d'} or from this on 'Layer_024': {'input': 'COMPUTE', 'output': 2317, 'type': 'Linear'} but I'm not sure... I mean I don't really now how to compute theses values, and that's why I'm asking for some help :)
I already try to put 519 on the output of 'Layer_022': {'output': 'COMPUTE', 'type': 'AdaptiveAvgPool2d'}, I tried also different values likes (2) (2,2)...
You need to put a nn.Flatten() in there. You've created a flatten layer in your code, but you need to put it in like the others. A similar method would be to call x = self.Layer_023(x).view([x.shape[0],-1]) in your forward call in order to get a size of [batch x feats].
For example:
In [3]: a = torch.randn([16,200,1,1])
In [4]: b = torch.nn.Linear(200,100)
In [5]: b(a)
RuntimeError: mat1 and mat2 shapes cannot be multiplied (3200x1 and 200x100)
In [6]: b(a.view([a.shape[0],-1]))
Out[6]:
tensor([[ 1.2927, -0.0799, 0.3909, ..., 0.5051, 0.4727, -0.1759],
[-0.2969, 0.2622, 0.6283, ..., -0.8404, -0.7275, -0.2853],
[ 0.3116, 0.2436, -1.0069, ..., 1.9674, -0.3689, -0.1099],
...,
[-0.6393, 0.3817, 0.0246, ..., 0.1511, -0.9695, 0.6455],
[ 0.0390, -0.7878, 0.3007, ..., 0.8577, -0.2808, -0.2726],
[ 0.1561, 0.0472, -0.0222, ..., 0.9957, -0.4121, -0.1465]],
grad_fn=<AddmmBackward0>)
Im using django with postgres i was able to add multiple filters in my views but mu question here is is there any possibility that i can filter multiple same jsonfield with different values:
ex i can filter localhost:127.0.0.1:products?areaOfUse=residential
so is there any possibilty that i can get the result of /products?areaOfUse=residential&areaOfUse=test
So from here i need to query two different json objects.
-Here are my views
class SubcategoriesProductsAPI(APIView):
# #cache_control(must_revalidate=True, max_age=3600)
def get (self, request, subCategoryId = None, pk = None):
try:
filters = {}
design = self.request.query_params.get('design', None)
dimension = self.request.query_params.get('dimension', None)
collectionName = self.request.query_params.get('collectionName', None)
material = self.request.query_params.get('material',None)
min_price = self.request.query_params.get('min_price',None)
max_price = self.request.query_params.get('max_price',None)
page = self.request.query_params.get('page', None)
wearLayer = self.request.query_params.get('wearLayer',None)
areaOfUse = self.request.query_params.getlist('areaOfUse',None)
productType = self.request.query_params.get('type', None)
installationMethod = self.request.query_params.get('installationMethod',None)
format_type = self.request.query_params.get('format_type',None)
wearLayer = self.request.query_params.get('wearLayer',None)
levelOfUse = self.request.query_params.get('levelOfUse',None)
if design is not None:
filters['product_options__options__data__design'] = design
if productType is not None:
filters['product_options__options__data__type'] = productType
if dimension is not None:
filters['product_options__options__data__dimensions__contains'] = [{'dimension': dimension}]
if collectionName is not None:
filters['product_options__options__data__collectionName'] = collectionName
if material is not None:
filters['product_options__options__data__material'] = material
if wearLayer is not None:
filters['product_options__options__data__wearLayer'] = wearLayer
if installationMethod is not None:
filters['product_options__options__data__installationMethod'] =installationMethod
if format_type is not None:
filters['product_options__options__data__format'] = format_type
if areaOfUse is not None:
filters['product_options__options__data__areaOfUse__contains'] = areaOfUse
if levelOfUse is not None:
filters['product_options__options__data__levelOfUse'] = levelOfUse
if min_price and max_price:
filters['product_options__options__data__dimensions__range__price'] = float(min_price)
filters['product_options__options__data__dimensions__0__price__lte'] = float(max_price)
queryset = Products.objects.filter(sub_categories_id = subCategoryId, is_active = True).select_related().filter(**filters)
if not queryset:
return JsonResponse({ 'status': False, 'msg': 'No products found', 'data': {} }, status=400)
if page is not None:
paginator = PageNumberPagination()
page = paginator.paginate_queryset(queryset, request)
if page is not None:
serializer = ProductSerializer(page, many=True)
return JsonResponse({ 'status': True, 'msg': 'Succesfully retrived products ', 'data': serializer.data, 'count': paginator.page.paginator.count, 'previous':paginator.get_previous_link(), 'next':paginator.get_next_link() }, status=200)
serializer = ProductSerializer(queryset, many=True)
return JsonResponse({ 'status': True, 'msg': 'Succesfully retrived products ', 'data': serializer.data }, status=200)
except Products.DoesNotExist:
return JsonResponse({ 'status': False, 'msg': 'Internal system error', 'data': {}}, status=500)
areaOfUse = self.request.query_params.getlist('areaOfUse[]',None)
/products?areaOfUse%5B%5D=residential&areaOfUse%5B%5D=test
import operator
from django.db.models import Q
from functools import reduce
queryset = Products.objects.filter(sub_categories_id = subCategoryId, is_active = True).select_related().filter(**filters)
if areaOfUse:
queryset.filter(
reduce(
operator.and_,
(Q(product_options__options__data__areaOfUse__contains=x) for x in areaOfUse)
)
)
##This is a container that I am using in the data table, but br tag in the column name is not working.
I used escape = FALSE in the data table but still facing the same issue.
and \n is also not working. I want something like this column name = first name (in the next line) last name.
test <- function(group,n){
htmltools::withTags(th(colspan = n, group, class = "dt-center"))
}
myContainer <- htmltools::withTags(table(
class = '',style="width:100%",
thead(
tr(
th(rowspan = 2, ' '),
th(colspan = 1, 'group 1', class = "dt-center"),
th(colspan = 2, 'group 2', class = "dt-center"),
th(colspan = 2, 'group 3', class = "dt-center")
),
tr(
th("new \\\\n ID"),
lapply(c("SUBJID","SITE<br>ID","AG<br>E","SUBJID","RACE"), th)
)
)
))
Server <- function(input, output, session) {
adae<-read_sas("C:/Arinjay_Intern/Work/ADaM/adae.sas7bdat")
output$intTable<-renderDT({adae_df %>%
datatable(class= 'compact', extensions = 'Buttons', rownames = F, container = myContainer,escape = FALSE,
callback = JS(c("$('table.dataTable thead th').css('border-top', 'none');",
"$('table.dataTable.no-footer').css('border-top', 'none');"
)),
options = list(dom = 'tB', pageLength = 5,
ordering = FALSE, class= "compact",
columnDefs = list(list(className = "dt-center", targets = "_all")),
buttons = 'pdf'
),
caption = htmltools::tags$caption(
style = 'caption-side: bottom; text-align: left;',
htmltools::em(HTML('N = number of subjects in the specified population. <br>n=number of subjects in each category. % = 100*n/N.')))
) %>%
formatStyle(c("USUBJID","SUBJID","SITEID","AGE", "SEX","RACE"), backgroundColor = 'white')
})
}
UI <-navbarPage(
"DT Interactive Tables",
tabPanel(
"ADaM DataSets",
fluidPage(
checkboxGroupInput('group','Please select a group',c('FD_Cohort','MRD_Cohort')),
textInput('n',"any value",value=2),
DTOutput("intTable")
)
)
)
shinyApp(UI,Server)
Expected output:
Neither \n or <br>, work in xtable. So, you could define the rows explicitly as shown below:
row1 <- c(" USUB","SUBJ","SITE","AG","SEX", "RACE")
row2 <- c("JID","ID","ID","E"," ", "")
myContainer <- htmltools::withTags(table(
class = 'dt-center', style="width:100%",
thead(
tr(
th(colspan = 2, 'group 1', class = "dt-center"),
th(colspan = 2, 'group 2', class = "dt-center"),
th(colspan = 2, 'group 3', class = "dt-center")
),
tr( lapply( row1, th)
),
tr( lapply( row2, th)
)
)
))
or you can write something in css or js to handle it. The above code gives the following output on a dummy dataset:
I am trying in this way:
myContainer <- htmltools::withTags(table(
class = '',style="width:100%",
thead(
tr(
th(colspan = 2, 'Group 1', class = "dt-center"),
th(colspan = 2, 'Group 2', class = "dt-center")
),
tr(
lapply(names(data.frame(df)), th)
)
)
))
#UI:
library(shiny)
shinyUI <- fluidPage(
fluidRow(
htmlOutput("TextTable")
)
My rendertable in server:
output$TextTable <- renderTable({
varsub
}, width = "100%", include.colnames = TRUE,
sanitize.text.function = identity, spacing = 's',
container = myContainer,
columnDefs = list(list(className = "dt-center", targets = "_all"))
)
But this is not working, any help would be appreciated. Or please suggest any other way to do it.
I have a column that contains JSON data as in the following example,
library(data.table)
test <- data.table(a = list(1,2,3),
info = list("{'duration': '10', 'country': 'US'}",
"{'duration': '20', 'country': 'US'}",
"{'duration': '30', 'country': 'GB', 'width': '20'}"))
I want to convert the last column to equivalent R storage, which would look similar to,
res <- data.table(a = list(1, 2, 3),
duration = list(10, 20, 30),
country = list('US', 'US', 'GB'),
width = list(NA, NA, 20))
Since I have 500K rows with different contents I would look for a quick way to do this.
A variation without the need to separate out the JSON string
library(data.table)
library(jsonlite)
test[, info := gsub("'", "\"", info)]
test[, rbindlist(lapply(info, fromJSON), use.names = TRUE, fill = TRUE)]
# duration country width
# 1: 10 US NA
# 2: 20 US NA
# 3: 30 GB 20
Parse the JSON first, then build the data.frame (or data.table):
json_string <- paste(c("[{'duration': '10', 'country': 'US'}",
"{'duration': '20', 'country': 'US'}",
"{'duration': '30', 'country': 'GB'}",
"{'width': '20'}]"), collapse=", ")
# JSON standard requires double quotes
json_string <- gsub("'", "\"", json_string)
library("jsonlite")
fromJSON(json_string)
# duration country width
# 1 10 US <NA>
# 2 20 US <NA>
# 3 30 GB <NA>
# 4 <NA> <NA> 20
This isn't exactly what you asked for as your JSON doesn't associate 'width' with the previous record, you might need to do some manipulation first:
json_string <- paste(c("[{'duration': '10', 'country': 'US'}",
"{'duration': '20', 'country': 'US'}",
"{'duration': '30', 'country': 'GB', 'width': '20'}]"),
collapse=", ")
json_string <- gsub("'", "\"", json_string)
df <- jsonlite::fromJSON(json_string)
data.table::as.data.table(df)
# duration country width
# 1: 10 US NA
# 2: 20 US NA
# 3: 30 GB 20