Extracting JSON-data from CSV file - json

I'm trying to extract JSON data that is stored in a column of a CSV file. So far I've managed to extract the column in the right format, but the formatting is only correct when the variable type is factor, and I can't convert a factor to JSON using the jsonlite package.
[1] {"id":509746197991998767,"visibility":{"percentage":100,"time":149797,"visible1":true,"visible2":false,"visible3":false,"activetab":true},"interaction":{"mouseovercount":1,"mouseovertime":1426,"videoplaytime":0,"engagementtime":0,"expandtime":0,"exposuretime":35192}}
Another approach is to use stringsAsFactors = F when importing, but I'm struggling in getting the formatting right, where each entry looks like this:
[1] "{\"id\":509746197991998767,\"visibility\":{\"percentage\":100,\"time\":149797,\"visible1\":true,\"visible2\":false,\"visible3\":false,\"activetab\":true},\"interaction\":{\"mouseovercount\":1,\"mouseovertime\":1426,\"videoplaytime\":0,\"engagementtime\":0,\"expandtime\":0,\"exposuretime\":35192}}"
Am I missing something obvious here? I simply want to extract the JSON documents that sit inside a CSV file.
Here's a small example of the CSV file:
"","CookieID","UnloadVars"
"1",-8857188784608690176,"{""id"":509746197991998767,""visibility"":{""percentage"":100,""time"":149797,""visible1"":true,""visible2"":false,""visible3"":false,""activetab"":true},""interaction"":{""mouseovercount"":1,""mouseovertime"":1426,""videoplaytime"":0,""engagementtime"":0,""expandtime"":0,""exposuretime"":35192}}"
"2",-1695626857458244096,"{""id"":2917654329769114342,""visibility"":{""percentage"":46,""time"":0,""visible1"":false,""visible2"":false,""visible3"":false,""activetab"":true}}"
"3",437299165071669184,"{""id"":2252707957388071809,""visibility"":{""percentage"":99,""time"":10168,""visible1"":true,""visible2"":false,""visible3"":false,""activetab"":true},""interaction"":{""mouseovercount"":0,""mouseovertime"":0,""videoplaytime"":0,""engagementtime"":0,""expandtime"":0,""exposuretime"":542},""clicks"":[{""x"":105,""y"":449}]}"
"4",292660729552227520,""
"5",7036383942916227072,"{""id"":2299674593327687292,""visibility"":{""percentage"":76,""time"":1145,""visible1"":true,""visible2"":false,""visible3"":false,""activetab"":true},""interaction"":{""mouseovercount"":0,""mouseovertime"":0,""videoplaytime"":0,""engagementtime"":0,""expandtime"":0,""exposuretime"":74},""clicks"":[{""x"":197,""y"":135},{""x"":197,""y"":135}]}"
Regards,
Frederik.

# Parse the sample CSV from an inline string. The compact column spec "-cc"
# skips the unnamed row-number column and reads CookieID and UnloadVars as
# character, so the 19-digit CookieID values are not rounded to doubles.
df <- readr::read_csv('"","CookieID","UnloadVars"
"1",-8857188784608690176,"{""id"":509746197991998767,""visibility"":{""percentage"":100,""time"":149797,""visible1"":true,""visible2"":false,""visible3"":false,""activetab"":true},""interaction"":{""mouseovercount"":1,""mouseovertime"":1426,""videoplaytime"":0,""engagementtime"":0,""expandtime"":0,""exposuretime"":35192}}"
"2",-1695626857458244096,"{""id"":2917654329769114342,""visibility"":{""percentage"":46,""time"":0,""visible1"":false,""visible2"":false,""visible3"":false,""activetab"":true}}"
"3",437299165071669184,"{""id"":2252707957388071809,""visibility"":{""percentage"":99,""time"":10168,""visible1"":true,""visible2"":false,""visible3"":false,""activetab"":true},""interaction"":{""mouseovercount"":0,""mouseovertime"":0,""videoplaytime"":0,""engagementtime"":0,""expandtime"":0,""exposuretime"":542},""clicks"":[{""x"":105,""y"":449}]}"
"4",292660729552227520,""
"5",7036383942916227072,"{""id"":2299674593327687292,""visibility"":{""percentage"":76,""time"":1145,""visible1"":true,""visible2"":false,""visible3"":false,""activetab"":true},""interaction"":{""mouseovercount"":0,""mouseovertime"":0,""videoplaytime"":0,""engagementtime"":0,""expandtime"":0,""exposuretime"":74},""clicks"":[{""x"":197,""y"":135},{""x"":197,""y"":135}]}"',
col_types = "-cc")
Using jsonlite::fromJSON on each separate value, then tidyr::unnest
library(dplyr)

# Parse one UnloadVars cell into a data frame.
#   .x: a single JSON string, or NA / "" when the cell is empty.
# Returns a one-row, zero-column data frame for an empty cell (so the CSV row
# survives unnest() with NA in every JSON column); otherwise the parsed JSON
# flattened by as.data.frame(), one row per element of the longest array.
f <- function(.x) {
  if (is.na(.x) || .x == "") {
    return(data.frame()[1, ])
  }
  # bigint_as_char = TRUE keeps integers too large for a double as strings;
  # without it the 18-19 digit ids are rounded before they can be converted.
  parsed <- as.data.frame(
    jsonlite::fromJSON(.x, bigint_as_char = TRUE),
    stringsAsFactors = FALSE
  )
  # Store every *id column as character so rows with small and large ids
  # have the same column type when they are combined.
  is_id <- endsWith(tolower(names(parsed)), "id")
  parsed[is_id] <- lapply(parsed[is_id], as.character)
  parsed
}

df %>%
  mutate(UnloadVars = lapply(UnloadVars, f)) %>%
  tidyr::unnest(UnloadVars)
# A tibble: 6 x 16
# CookieID id visibility.percentage visibility.time visibility.visible1 visibility.visible2 visibility.visible3 visibility.activetab interaction.mouseovercount interaction.mouseovertime interaction.videoplaytime interaction.engagementtime interaction.expandtime interaction.exposuretime clicks.x clicks.y
# <chr> <chr> <int> <int> <lgl> <lgl> <lgl> <lgl> <int> <int> <int> <int> <int> <int> <int> <int>
# 1 -8857188784608690176 509746197991998767 100 149797 TRUE FALSE FALSE TRUE 1 1426 0 0 0 35192 NA NA
# 2 -1695626857458244096 2917654329769114342 46 0 FALSE FALSE FALSE TRUE NA NA NA NA NA NA NA NA
# 3 437299165071669184 2252707957388071809 99 10168 TRUE FALSE FALSE TRUE 0 0 0 0 0 542 105 449
# 4 292660729552227520 <NA> NA NA NA NA NA NA NA NA NA NA NA NA NA NA
# 5 7036383942916227072 2299674593327687292 76 1145 TRUE FALSE FALSE TRUE 0 0 0 0 0 74 197 135
# 6 7036383942916227072 2299674593327687292 76 1145 TRUE FALSE FALSE TRUE 0 0 0 0 0 74 197 135

I used readr::read_csv to read in your sample data set.
> df <- readr::read_csv('~/sample.csv')
Parsed with column specification:
cols(
CookieID = col_double(),
UnloadVars = col_character()
)
As you can see the UnloadVars are read in as characters and not factors. If I now examine the first value in the UnloadVars columns I see the following which matches what you get,
> df$UnloadVars[1]
[1] "{\"id\":509746197991998767,\"visibility\":{\"percentage\":100,\"time\":149797,\"visible1\":true,\"visible2\":false,\"visible3\":false,\"activetab\":true},\"interaction\":{\"mouseovercount\":1,\"mouseovertime\":1426,\"videoplaytime\":0,\"engagementtime\":0,\"expandtime\":0,\"exposuretime\":35192}}"
Now, I use jsonlite::fromJSON,
> j <- jsonlite::fromJSON(df$UnloadVars[1])
> j
$id
[1] 5.097462e+17
$visibility
$visibility$percentage
[1] 100
$visibility$time
[1] 149797
$visibility$visible1
[1] TRUE
$visibility$visible2
[1] FALSE
$visibility$visible3
[1] FALSE
$visibility$activetab
[1] TRUE
$interaction
$interaction$mouseovercount
[1] 1
$interaction$mouseovertime
[1] 1426
$interaction$videoplaytime
[1] 0
$interaction$engagementtime
[1] 0
$interaction$expandtime
[1] 0
$interaction$exposuretime
[1] 35192
Which I believe is what you need since JSONs are parsed as lists in R.

It can be very tricky to deal with JSON data. As a general guide line, you should always strive to have your data in a data frame. This, however, is not always possible. In the specific case, I don't see a way you can have both visibility and interaction values at once in a nicely formatted data frame.
What I will do next is to extract the information from interaction into a data frame.
Load required packages and read the data
library(purrr)
library(dplyr)
library(tidyr)
# stringsAsFactors = FALSE keeps UnloadVars as plain character strings (not factors).
# NOTE(review): no colClasses is given, so CookieID is presumably parsed as
# numeric; 19-digit ids would then lose precision — confirm.
df <- read.csv("sample.csv", stringsAsFactors = FALSE)
Then remove the rows that do not contain valid JSON
# Keep only the rows whose UnloadVars cell holds JSON text
# (in the sample, the 4th row has an empty string there and is dropped).
df <- dplyr::filter(df, UnloadVars != "")
Transform each JSON into a list and put them into UnloadVars column. If you didn't know that, it is possible to have list column in a data frame. This can be very useful.
# Build the result in one step instead of growing it row by row:
# CookieID is copied as-is and UnloadVars becomes a list column holding one
# parsed JSON document (a nested list) per row. This also works for a
# zero-row `df`, where a `1:nrow(df)` loop would iterate over c(1, 0).
out <- tibble(
  CookieID = df$CookieID,
  # bigint_as_char = TRUE keeps ids that do not fit a double as strings
  # instead of rounding them.
  UnloadVars = map(df$UnloadVars, jsonlite::fromJSON, bigint_as_char = TRUE)
)
out
We can now extract the IDs from the lists in Unload Vars. This is straight forward because there is only one ID per list.
# Pull the single `id` out of every parsed document as a string.
# format() is used because map_chr() must be given character values: it
# renders a numeric id in plain (non-scientific) notation and leaves an id
# that is already a string untouched.
out <- out %>%
  mutate(id = map_chr(UnloadVars, ~ format(.x$id, scientific = FALSE, trim = TRUE)))
This final part can seem a bit intimidating. But what I am doing here is taking interaction part from UnloadVars column and putting it into a interaction column. I then transform each row from interaction, which is a list, into a data frame with two columns: key and value. key contains the name of the interaction metric and value its value. I finally unnest it, so we get rid of list columns and end up with a nicely formatted data frame.
# Reshape one `interaction` element into a long data frame with the columns
# `key` (metric name) and `value` (metric value).
#   obj:      named list of metrics; NULL when the document has no interaction
#   key_name: unused, kept (now optional) so existing two-argument calls work
unpack_list <- function(obj, key_name = NULL) {
  # gather() is kept deliberately: for an empty `obj` it hands back the empty
  # data frame, which unnest() below then simply drops.
  gather(as.data.frame(obj), key)
}

# One row per (CookieID, interaction metric).
df_interaction <- out %>%
  mutate(interaction = map(UnloadVars, ~ unpack_list(.x$interaction))) %>%
  unnest(interaction)
df_interaction
The solution is not very elegant, but gets the job done. You could apply the same logic to extract information from visibility.

Related

How can I filter out numbers from an html table in R?

I am currently working on a forecasting model and to do this I would like to import data from an HTML website into R and save the values-part of the data set into a new list.
I have used the following approach in R:
# getting website data:
# NOTE(review): htmlParse()/getNodeSet()/removeNodes()/readHTMLTable() look like
# the XML package and GET()/user_agent() like httr; no library() calls are
# shown in this snippet — confirm.
link <- "https://www.tradegate.de/orderbuch.php?isin=US13200M5085"
document <- htmlParse(GET(link, user_agent("Mozilla")))
# drop the HTML comment nodes from the parsed document before reading tables
removeNodes(getNodeSet(document,"//*/comment()"))
doc.tables<-readHTMLTable(document)
# show BID/ASK block:
# single brackets return a one-element list rather than the table itself
doc.tables[2]
Which (doc.tables[2]) gives me in this case the result:
$`NULL`
Bid 0,765
1 Ask 0,80
How can i filter out the numbers (0,765 & 0,80) of the table, to save it into a list?
The issue is that 0,765 is actually the name of your data.frame column.
Your data frame is doc.tables[[2]].
You can grab the name by calling names(doc.tables[[2]])[2]
and store it in a variable, e.g. name <- names(doc.tables[[2]])[2]
Then you can grab the 0,80 with doc.tables[[2]][[2]]; store that in a variable as well if you like.
The final code should look like: my_list <- list(name, doc.tables[[2]][[2]])
Here is a way with rvest, not package XML.
The code below uses two more packages, stringr and readr, to extract the values and their names.
library(httr)
library(rvest)
library(dplyr)
link <- "https://www.tradegate.de/orderbuch.php?isin=US13200M5085"
page <- read_html(link)
# Text of every table row on the page; rows 3 and 4 are the Bid and Ask lines.
row_text <- html_text(html_elements(page, "tr"))
# Turn the decimal comma into a point so the quotes parse as numbers.
tbl <- stringr::str_replace_all(row_text[3:4], ",", ".")
tibble(
  name = stringr::str_extract(tbl, "Ask|Bid"),
  value = readr::parse_number(tbl)
)
#> # A tibble: 2 x 2
#> name value
#> <chr> <dbl>
#> 1 Bid 0.765
#> 2 Ask 0.8
Created on 2022-03-26 by the reprex package (v2.0.1)
Without saving the pipe result to a temporary object, tbl, the pipe can continue as below.
library(httr)
library(rvest)
library(stringr)
suppressPackageStartupMessages(library(dplyr))
link <- "https://www.tradegate.de/orderbuch.php?isin=US13200M5085"
page <- read_html(link)
page %>%
  html_elements("tr") %>%
  html_text() %>%
  # rows 3 and 4 are the Bid and Ask lines
  .[3:4] %>%
  # decimal comma -> decimal point
  str_replace_all(",", ".") %>%
  # The braces stop the pipe from also inserting the text vector as the first
  # argument of tibble(), so no stray column has to be removed afterwards.
  {
    tibble(
      name = str_extract(., "Ask|Bid"),
      value = readr::parse_number(.)
    )
  }
#> # A tibble: 2 x 2
#> name value
#> <chr> <dbl>
#> 1 Bid 0.765
#> 2 Ask 0.8
Created on 2022-03-27 by the reprex package (v2.0.1)
This is building on Jahi Zamy’s observation that some of your data are showing up as column names and on the example code in the question.
library(httr)
library(XML)
# getting website data:
link <- "https://www.tradegate.de/orderbuch.php?isin=US13200M5085"
document <- htmlParse(GET(link, user_agent("Mozilla")))
# readHTMLTable() assumes tables have a header row by default,
# but these tables do not, so use header = FALSE
doc.tables <- readHTMLTable(document, header = FALSE)
# Extract the value column (column 2) of the BID/ASK table (table 2)
BidAsk <- doc.tables[[2]][, 2]
# Replace the decimal comma with a point, then convert to numeric
BidAsk <- as.numeric(gsub(",", ".", BidAsk))

R: Vector of JSONs to data.frame

I have a vector of JSONs (all of the same structure) and want to transform it into a data.frame. The following example does exactly what I want.
require(jsonlite) # fromJSON()
require(magrittr) # for the pipeline only
require(data.table) # rbindlist()
# Three sample documents with identical structure; "list" is a nested object.
jsons <- c('{"num":1,"char":"a","list":{"x":1,"y":2}}',
'{"num":2,"char":"b","list":{"x":1,"y":2}}',
'{"num":3,"char":"c","list":{"x":1,"y":2}}')
# Parse each document separately, turn each parsed list into a data frame,
# then stack the per-document frames (fill = T pads columns that are missing).
# `F` / `T` are the shorthand for FALSE / TRUE.
df <- jsons %>%
lapply(fromJSON) %>%
lapply(as.data.frame.list, stringsAsFactors = F) %>%
rbindlist(fill = T)
Some elements of the JSON are objects, i.e. when I parse it with fromJSON() some elements of the resulting list are lists as well. I cannot apply unlist() to each list because the variables have different types, so I am using the as.data.frame.list() function. This is, however, too slow to do for each JSON individually. Is there a way to do it more efficiently?
# A larger, more deeply nested document used for timing.
json <- '{"$schema":"http://json-schema.org/draft-04/schema#","title":"Product set","type":"array","items":{"title":"Product","type":"object","properties":{"id":{"description":"The unique identifier for a product","type":"number"},"name":{"type":"string"},"price":{"type":"number","minimum":0,"exclusiveMinimum":true},"tags":{"type":"array","items":{"type":"string"},"minItems":1,"uniqueItems":true},"dimensions":{"type":"object","properties":{"length":{"type":"number"},"width":{"type":"number"},"height":{"type":"number"}},"required":["length","width","height"]},"warehouseLocation":{"description":"Coordinates of the warehouse with the product","$ref":"http://json-schema.org/geo"}},"required":["id","name","price"]}}'
# Time the same per-document pipeline on 1000 copies of that document.
# NOTE(review): the trailing "2.72" is presumably the elapsed time in seconds.
system.time(
df <- json %>% rep(1000) %>%
lapply(fromJSON) %>%
lapply(as.data.frame.list, stringsAsFactors = F) %>%
rbindlist(fill = T)
) # 2.72
I know that there are plenty of similar questions, but most of the answers I saw were about using as.data.frame() or data.frame(), and nobody mentioned speed. Maybe there is no better solution to this.
I finally found the answer. It will be on CRAN soon.
# Install the development version of tidyjson from GitHub.
devtools::install_github("jeremystan/tidyjson")
# Shown without arguments only to name the function.
# NOTE(review): it presumably has to be given the JSON character vector (or a
# tbl_json), e.g. tidyjson::spread_all(jsons) — confirm against the tidyjson docs.
tidyjson::spread_all()
This function is about 10-times faster than my example above.
Try to collapse all JSONs in the one string. Let's show example of the solution:
require(jsonlite)
require(data.table)
# Sample document that is replicated to build the benchmark input.
json <- '{"$schema":"http://json-schema.org/draft-04/schema#","title":"Product set","type":"array","items":{"title":"Product","type":"object","properties":{"id":{"description":"The unique identifier for a product","type":"number"},"name":{"type":"string"},"price":{"type":"number","minimum":0,"exclusiveMinimum":true},"tags":{"type":"array","items":{"type":"string"},"minItems":1,"uniqueItems":true},"dimensions":{"type":"object","properties":{"length":{"type":"number"},"width":{"type":"number"},"height":{"type":"number"}},"required":["length","width","height"]},"warehouseLocation":{"description":"Coordinates of the warehouse with the product","$ref":"http://json-schema.org/geo"}},"required":["id","name","price"]}}'
# Number of copies in the benchmark input; change it here only.
n <- 1000
ex <- rep(json, n)
# Baseline: parse every JSON string on its own, convert each parsed list to a
# data frame, and stack the frames (columns missing in a frame are filled).
#   x: character vector of JSON documents
f1 <- function(x) {
  frames <- lapply(x, function(txt) {
    as.data.frame.list(fromJSON(txt), stringsAsFactors = FALSE)
  })
  rbindlist(frames, fill = TRUE)
}
# Fast variant: glue all documents into one JSON array and parse it once.
#   x: character vector of JSON documents
# Returns the flattened data frame with one row per document.
f2 <- function(x) {
  res <- jsonlite::fromJSON(
    paste0("[", paste(x, collapse = ","), "]"),
    flatten = TRUE
  )
  # vapply() always yields a logical vector, even when `res` has no columns.
  is_list_col <- vapply(res, is.list, logical(1))
  # Turn each list column into a data.table with one column per position.
  # transpose() and flatten() are namespace-qualified because purrr exports
  # functions with the same names and would mask them when attached.
  res[is_list_col] <- lapply(res[is_list_col], function(col) {
    data.table::as.data.table(data.table::transpose(col))
  })
  jsonlite::flatten(res)
}
# Time both implementations on the same input, at least 100 runs each.
# check = FALSE: the two results are not compared with each other.
bench::mark(
  f1(ex),
  f2(ex),
  min_iterations = 100,
  check = FALSE
)
#> # A tibble: 2 x 14
#> expression min mean median max `itr/sec` mem_alloc n_gc n_itr #> total_time result memory time
#> <chr> <bch:t> <bch:t> <bch:t> <bch:tm> <dbl> <bch:byt> <dbl> <int> #> <bch:tm> <list> <list> <lis>
#> 1 f1(ex) 2.27s 2.35s 2.32s 2.49s 0.425 0B 5397 100 #> 3.92m <data… <Rpro… <bch…
#> 2 f2(ex) 48.85ms 63.78ms 57.88ms 116.19ms 15.7 0B 143 100 #> 6.38s <data… <Rpro… <bch…
#> # … with 1 more variable: gc <list>

R loops with JSON API Source

I'm trying to get data for books prices from API (http://www.knigoed.info/api-prices.html) based on ISBN.
The idea is to submit vector of ISBNs to the function to get a data frame with all available info (or at least Data.Frame with prices from different vendors)
isbns<- c("9785170922789", "9785170804801", "9785699834174", "9785699717255", "9785170869237")
# Query the price API once per ISBN and collect the `prices` tables.
#   ISBN:   character vector of ISBNs
#   source: URL prefix the ISBN is appended to
# Returns the object AAAA built inside the loop (see the notes below).
getISBNprice <- function(ISBN, source="http://www.knigoed.info/api/Prices?code=") {
pathA <- source
# NOTE(review): 1:length(ISBN) iterates over c(1, 0) when ISBN is empty.
for (i in 1:length(ISBN)) {
ISB <- ISBN[i]
AAA <- paste(pathA, ISB, "&sortPrice=DESC&country=RU", sep="")
document <- fromJSON(AAA, flatten = FALSE)
dfp <- document$prices
# tag every price row with the ISBN it belongs to
dfp <- cbind(dfp,ISB )
# dfp <- cbind(dfp,BookID=document$bookId)
# dfp <- cbind(dfp,Title=document$title)
# dfp <- cbind(dfp,Author=document$author)
# dfp <- cbind(dfp,Publisher=document$publisher)
# dfp <- cbind(dfp,Series=document$series)
# dfp <- cbind(dfp,Picture=document$picture)
# NOTE(review): the bind_rows() result is not assigned, so AAAA keeps only the
# rows of the first ISBN. exists("AAAA") also looks in enclosing environments,
# so a global AAAA would be picked up.
if (!exists("AAAA")) {AAAA<- dfp} else {bind_rows(AAAA, dfp) }
}
AAAA
}
But the function produces these warnings:
1. In bind_rows_(x, .id) : Unequal factor levels: coercing to character
2: In bind_rows_(x, .id) : Unequal factor levels: coercing to character
3: In bind_rows_(x, .id) : Unequal factor levels: coercing to character
4: In bind_rows_(x, .id) : Unequal factor levels: coercing to character
It's easiest to make a list from the start, which makes simplifying later easier. The purrr package can make working with lists much easier, though the usages here can be replaced with base R's lapply and mapply/Map if you prefer.
library(purrr)
# paste0() is vectorized, so build the vector of URLs all at once.
# `httr` can make a URL out of a list of named parameters, if it's more convenient.
results <- paste0("http://www.knigoed.info/api/Prices?code=",
                  isbns,
                  "&sortPrice=DESC&country=RU") %>%
  # Iterate over the vector of URLs, using fromJSON to pull and parse each request.
  # map, like lapply, will put the results into a list.
  map(jsonlite::fromJSON, flatten = FALSE)
# Grab the "prices" element of each top-level list element
results %>% map('prices') %>%
  # Iterate in parallel (like mapply/Map) over prices and isbns, making a data.frame of
  # each. map2_df will coerce the resulting list of data.frames to a single data.frame.
  map2_df(isbns, ~data.frame(isbn = .y, .x, stringsAsFactors = FALSE)) %>%
  # For pretty printing; as_tibble() replaces the deprecated as_data_frame()
  tibble::as_tibble()
## # A tibble: 36 x 10
## isbn shopId name domain
## <chr> <chr> <chr> <chr>
## 1 9785170922789 29 Магистр booka.ru
## 2 9785170922789 3 Лабиринт labirint.ru
## 3 9785170922789 20 LitRes.ru litres.ru
## 4 9785170804801 29 Магистр booka.ru
## 5 9785170804801 2 Read.ru read.ru
## 6 9785170804801 3 Лабиринт labirint.ru
## 7 9785170804801 63 Эксмо eksmo.ru
## 8 9785170804801 1 OZON.ru ozon.ru
## 9 9785170804801 4 My-shop.ru my-shop.ru
## 10 9785170804801 1 OZON.ru ozon.ru
## # ... with 26 more rows, and 6 more variables: url <chr>, available <lgl>, downloadable <lgl>,
## # priceValue <dbl>, priceSuffix <chr>, year <int>

Parsing JSON URL in R with different number of fields

I'm having a lot of trouble trying to read some JSON data obtained from a URL in R. I'm able to read in the data, and call on each observation to get the values (as characters which is fine), but I can't seem to find a way to get the data in a table format (basically like in excel).
I've tried to create a loop which calls on each field to place it in an empty matrix; however, not every object has the same number of fields (i.e. some values have Label1 and Label2, while others just have Label1). I get the error that the subscripts are out of bounds. What I was thinking was to make a conditional statement so that if the field exists its value is put in the data matrix, and if the field does not exist an NA is inserted. However, I get the subscript error immediately and cannot do the conditional evaluation. I've looked to see if I can coerce an error to become an NA, but I don't think this is possible.
I'm starting the index from j=3, since the first two observations in the JSON code are not needed for me. My problem is that for example "json$poi[[j]]$label[[2]]$value" may not exist for every observation and I automatically get an error when the code comes across the first observation missing this field.
The data is quite big - around 4480 observations with up to 20 fields each. I only require the 9 fields I have listed however. Here is a link to the data URL - it may take some time to load. Im quite new to coding, and especially trying to deal with JSON files, so my apology if this has a simple solution that I'm not seeing.
Thanks!
http://tourism.citysdk.cm-lisboa.pt/pois/?limit=-1
library(rjson)
library(RCurl)
# Download and parse the full POI list.
json <- fromJSON(getURL('http://tourism.citysdk.cm-lisboa.pt/pois/?limit=-1'))
# The first two entries of json$poi are skipped, hence the -2 and j <- i+2 below.
ljson <- length(json$poi)-2
# One row per remaining POI, one column per extracted field (d1..d9).
data <- matrix(data=NA, nrow=ljson, ncol=9)
for(i in 1:ljson)
{
j <- i+2
# NOTE(review): `[[2]]` on a list with fewer than two elements raises
# "subscript out of bounds"; that error is raised here, before any of the
# exists() checks below is reached.
d1 <- json$poi[[j]]$location$point[[1]]$Point$posList
d2 <- json$poi[[j]]$label[[1]]$value
d3 <- json$poi[[j]]$label[[2]]$value
d4 <- json$poi[[j]]$category[[1]]$value
d5 <- json$poi[[j]]$category[[2]]$value
d6 <- json$poi[[j]]$id
d7 <- json$poi[[j]]$author$value
d8 <- json$poi[[j]]$license$value
d9 <- json$poi[[j]]$description[[1]]$value
# NOTE(review): d1..d9 were all assigned just above, so every exists() test
# below is TRUE and the NA branches are never taken.
if(exists("d1") == TRUE){
d1 <- json$poi[[j]]$location$point[[1]]$Point$posList
} else {
d1 <- NA
}
if(exists("d2") == TRUE){
d2 <- json$poi[[j]]$label[[1]]$value
} else {
d2 <- NA
}
if(exists("d3") == TRUE){
d3 <- json$poi[[j]]$label[[2]]$value
} else {
d3 <- NA
}
if(exists("d4") == TRUE){
d4 <- json$poi[[j]]$category[[1]]$value
} else {
d4 <- NA
}
if(exists("d5") == TRUE){
d5 <- json$poi[[j]]$category[[2]]$value
} else {
d5 <- NA
}
if(exists("d6") == TRUE){
d6 <- json$poi[[j]]$id
} else {
d6 <- NA
}
if(exists("d7") == TRUE){
d7 <- json$poi[[j]]$author$value
} else {
d7 <- NA
}
if(exists("d8") == TRUE){
d8 <- json$poi[[j]]$license$value
} else {
d8 <- NA
}
if(exists("d9") == TRUE){
d9 <- json$poi[[j]]$description[[1]]$value
} else {
d9 <- NA
}
# c() silently drops NULL fields, so a missing field makes the vector shorter
# than the 9 columns of the row it is assigned to.
data[i,] <- rbind(c(d1,d2,d3,d4,d5,d6,d7,d8,d9))
}
For JSON & XML list structures str is your friend! You can use that to inspect all or portions of a list structure. sapply on individual components to extract is probably better than the for construct and you'll need to handle NULLs and missing sub-structure components to build a data frame from that JSON (and many JSON files, actually). The following gets you started, but you still have some work to do:
# simplify extraction (saves typing, too)
poi <- json$poi
# drop the first two elements; negative indexing also copes with fewer than
# three elements, where 3:length(poi) would count backwards
poi <- poi[-(1:2)]

# Return `value` as a length-one character, or NA when it is NULL/empty.
# Missing fields become NA so that data.frame() below gets equal-length vectors.
# NOTE(review): assumes the extracted fields are strings — confirm.
chr_or_na <- function(value) {
  if (length(value) == 0) NA_character_ else as.character(value[[1]])
}

# `$value` of the first entry of `entries`, or NULL when there is no entry.
first_value <- function(entries) {
  if (length(entries) == 0) NULL else entries[[1]]$value
}

# vapply(..., character(1)) always returns a character vector, whereas
# sapply() falls back to a list as soon as one element is NULL.
# the point list is not always there, so check before indexing into it
poi_points <- vapply(poi, function(x) {
  points <- x$location$point
  chr_or_na(if (length(points) > 0) points[[1]]$Point$posList)
}, character(1))
poi_description <- vapply(poi, function(x) chr_or_na(first_value(x$description)), character(1))
poi_category <- vapply(poi, function(x) chr_or_na(first_value(x$category)), character(1))
poi_label <- vapply(poi, function(x) chr_or_na(first_value(x$label)), character(1))
poi_id <- vapply(poi, function(x) chr_or_na(x$id), character(1))
poi_author <- vapply(poi, function(x) chr_or_na(x$author$value), character(1))
poi_license <- vapply(poi, function(x) chr_or_na(x$license$value), character(1))
# make a data frame
poi <- data.frame(poi_label, poi_category, poi_id, poi_points, poi_author, poi_license, poi_description)
str(poi)
## 'data.frame': 4482 obs. of 7 variables:
## $ poi_label : Factor w/ 4482 levels "\"Bloco das Águas Livres\", edifício de habitação, comércio e serviços",..: 363 765 764 1068 174 419 461 762 420 412 ...
## $ poi_category : Factor w/ 129 levels "Acessórios de Uso Pessoal",..: 33 33 33 33 33 33 123 33 33 33 ...
## $ poi_id : Factor w/ 4482 levels "52d7bf4d723e8e0b0cc08b69",..: 2 3 4 5 7 8 15 16 17 18 ...
## $ poi_points : Factor w/ 3634 levels "38.405892 -9.93503",..: 975 244 478 416 301 541 2936 2975 2850 2830 ...
## $ poi_author : Factor w/ 1 level "CitySDK": 1 1 1 1 1 1 1 1 1 1 ...
## $ poi_license : Factor w/ 1 level "open-data": 1 1 1 1 1 1 1 1 1 1 ...
## $ poi_description: Factor w/ 2831 levels "","\n","\n\n",..: 96 1051 NA NA 777 1902 NA 1038 81 82 ...
##

Community detection with bipartite graph in igraph

I have a bipartite list (posts, word categories) with 1000 vertices and want to use the fast greedy algorithm for community detection, but I am not sure whether I have to run it on the bipartite graph or on the bipartite projection.
My bipartite list looks like this:
post word
1 66 2
2 312 1
3 432 7
4 433 7
5 434 1
6 435 5
7 436 1
8 437 4
When I run it without a projection I have problems clustering in the second step:
### Load bipartite list and create graph ###
bipartite_list <- read.csv("bipartite_list_tnf.csv", header = TRUE, sep = ";")
# NOTE(review): bipartite_list is the two-column post/word table as read from
# the file; graph.incidence() presumably expects an incidence matrix — confirm.
bipartite_graph <- graph.incidence(bipartite_list)
g<-bipartite_graph
fc <- fastgreedy.community(g) ## communities / clusters
set.seed(123)
l <- layout.fruchterman.reingold(g, niter=1000, coolexp=0.5) ## layout
membership(fc)
# 2. checking who is in each cluster
# NOTE(review): `post` is a column of bipartite_list; whether the object
# returned by fastgreedy.community() has a `post` element is not shown — confirm.
cl <- data.frame(name = fc$post, cluster = fc$membership, stringsAsFactors=F)
cl <- cl[order(cl$cluster),]
cl[cl$cluster==1,]
# 3. preparing data for plot
d <- data.frame(l); names(d) <- c("x", "y")
d$cluster <- factor(fc$membership)
# 4. plot with only nodes, colored by cluster
# NOTE(review): ggplot()/aes()/geom_point() look like ggplot2; no library()
# call is shown in this snippet.
p <- ggplot(d, aes(x=x, y=y, color=cluster))
pq <- p + geom_point()
pq
Maybe I have to run the community detection on a projection? But then I always get a failure because a projection is not a graph object:
bipartite_graph <- graph.incidence(bipartite_list)
#projection (both directions)
projection_word_post <- bipartite.projection(bipartite_graph)
# NOTE(review): the projection result is passed on unchanged and rejected as
# "Not a graph object"; bipartite.projection() presumably returns a list
# holding both projections rather than a single graph — confirm in the igraph docs.
fc <- fastgreedy.community(projection_word_post)
Fehler in fastgreedy.community(projection_word_post) : Not a graph object
I would be glad for help!
When you run without the projection the issue is at:
bipartite_graph <- graph.incidence(bipartite_list)
You need to reshape 'bipartite_list' before passing it to the graph.incidence() function. Use the command below:
# Cross-tabulate the two columns of bipartite_list into a contingency table.
# NOTE(review): presumably this post-by-word table is the incidence matrix
# that graph.incidence() expects — confirm.
tab <- table(bipartite_list)
The rest of the steps are the same:
# Build the graph from the table with mode "all", then run the fast greedy
# community detection on it.
g <- graph.incidence(tab, mode = "all")
fc <- fastgreedy.community(g)