Unable to load CSV file through Logstash

I am new to ELK and I am trying to load a locally stored .csv file through Logstash so that I can use it with Elasticsearch.
The Logstash config file looks like this:
input {
  file {
    path => "C:\ELK-Stack\Cars Data Set\cars.csv"
    start_position => "beginning"
    sincedb_path => "/dev/null"
  }
}
filter {
  csv {
    separator => ","
    columns => ["maker","model","mileage","manufacture-year","engine_displacement","engine_power","body_type", "color_slug","stk_year","transmission","door_count","seat_count","fuel_type","date_created","date_last_seen", "price_eur"]
  }
  mutate { convert => ["mileage", "integer"] }
  mutate { convert => ["price_eur", "float"] }
  mutate { convert => ["door_count", "integer"] }
  mutate { convert => ["engine_power", "integer"] }
  mutate { convert => ["seat_count", "integer"] }
}
output {
  elasticsearch {
    hosts => ["localhost:9200"]}
    index => "cars"
    document_type => "sold_cars"
  }
  stdout {}
}
And the path of the file is: C:\ELK-Stack\Cars Data Set\cars.csv
When I run Logstash I get an error instead of the data being indexed. The .csv file has well over a million rows. Any help would be appreciated.
EDIT:
Now I am working on another dataset and I am unable to load it through Logstash either. This is the config:
input {
  file {
    path => "C:\ELK-Stack\311.csv"
    start_position => "beginning"
    sincedb_path => "NUL"
  }
}
filter {
  csv {
    separator => ","
    columns => ["Unique Key","Created Date","Closed Date","Agency","Agency Name","Complaint Type","Descriptor", "Location Type","Incident Zip","Incident Address","Street Name","Cross Street 1","Cross Street 2","Intersection Street 1","Intersection Street 2", "Address Type", "City", "Landmark", "Facility Type", "Status", "Due Date", "Resolution Description", "Resolution Action Updated Date", "Community Board", "BBL", "Borough", "X Coordinate (State Plane)", "Y Coordinate (State Plane)", "Open Data Channel Type", "Park Facility Name", "Park Borough", "Vehicle Type", "Taxi Company Borough", "Taxi Pick Up Location", "Bridge Highway Name", "Bridge Highway Segment", "Latitude", "Longitude", "Location"]
  }
  mutate { convert => ["Unique Key", "integer"] }
  mutate { convert => ["Created Date", "timestamp"] }
  mutate { convert => ["Closed Date", "timestamp"] }
  mutate { convert => ["Due Date", "timestamp"] }
  mutate { convert => ["Resolution Action Updated Date", "timestamp"] }
  mutate { convert => ["X Coordinate (State Plane)", "integer"] }
  mutate { convert => ["X Coordinate (State Plane)", "integer"] }
  mutate { convert => ["Latitude", "integer"] }
  mutate { convert => ["Longitude", "integer"] }
  mutate { convert => ["Location", "integer"] }
}
output {
  elasticsearch {
    hosts => ["localhost:9200"]
    index => "311"
  }
  stdout {}
}
Any ideas what could be wrong?

You have two errors in your configuration. The first one is a typo in your output block: an extra closing curly bracket at the end of the hosts line, which is what the error log is describing:
exception => "LogStash::ConfigurationError"
The offending line is this one: hosts => ["localhost:9200"]}
This is the fixed output block:
output {
  elasticsearch {
    hosts => ["localhost:9200"]
    index => "cars"
  }
  stdout {}
}
Also, since you are running Logstash 7.5, you cannot use document_type; that option was removed in version 7.0.
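If you still want to keep the sold_cars label on each document, one workaround (my suggestion, not something required by the fix) is to store it as an ordinary field instead, for example:
mutate {
  # "doc_type" is just an example field name, pick whatever fits your mapping
  add_field => { "doc_type" => "sold_cars" }
}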
The second error is in your input block. You should use forward slashes in the path even when running Windows, and pointing sincedb_path at /dev/null is a Linux/macOS configuration; on Windows you should use NUL instead.
This is the corrected input block:
input {
  file {
    path => "C:/ELK-Stack/Cars Data Set/cars.csv"
    start_position => "beginning"
    sincedb_path => "NUL"
  }
}
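Putting both fixes together, the whole pipeline from your question would look roughly like this (a sketch, untested against your data):
input {
  file {
    path => "C:/ELK-Stack/Cars Data Set/cars.csv"
    start_position => "beginning"
    sincedb_path => "NUL"
  }
}
filter {
  csv {
    separator => ","
    columns => ["maker","model","mileage","manufacture-year","engine_displacement","engine_power","body_type","color_slug","stk_year","transmission","door_count","seat_count","fuel_type","date_created","date_last_seen","price_eur"]
  }
  mutate { convert => ["mileage", "integer"] }
  mutate { convert => ["price_eur", "float"] }
  mutate { convert => ["door_count", "integer"] }
  mutate { convert => ["engine_power", "integer"] }
  mutate { convert => ["seat_count", "integer"] }
}
output {
  elasticsearch {
    hosts => ["localhost:9200"]
    index => "cars"
  }
  stdout {}
}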

Related

Parsing nested JSON log file into ELK - Shodan.io Logs

I'm trying to parse a nested JSON log file (Shodan.io).
I have parsed a few values, but I am not able to parse the values mentioned below:
hostnames
smb
smb_version
shares
temporary
type
name
comments
anonymous
transport
It would also be good if I could get rid of the 'raw': [0000, 0000] values.
You can check my sample log file here.
Below is my existing logstash filter configuration:
input {
file {
path => [ "/path/to/shodan-logs.json" ]
start_position => "beginning"
sincedb_path => "/dev/null"
}
}
filter {
json {
source => "message"
target => "json_parse"
add_tag => ["json_filter"]
tag_on_failure => ["json"]
}
grok {
break_on_match => false
add_tag => ["filtered"]
tag_on_failure => ["no_match_found"]
match => {
"message" => [
"%{IP:client_ip}",
"%{TIMESTAMP_ISO8601:timestamp}"
]
}
}
geoip {
source => "client_ip"
add_tag => ["geo_ip_found"]
tag_on_failure => ["geo_ip_not_found"]
}
useragent {
source => "message"
add_tag => ["user_details_found"]
}
# ruby {
# add_tag => ["ruby_filter"]
# code => '
# props = event.get("message")
# if props
# props.each { |x|
# key = x["key"]
# event.set("message.#{key}", x["value"])
# }
# end
# '
# }
mutate {
remove_field => [ "#timestamp", "path", "host", "#version" ]
}
}
output {
elasticsearch {
hosts => ["http://localhost:9200"]
user => "elastic"
password => "password"
index => "shodan-demo-%{+dd-MM-YYYY}"
}
stdout {
codec => rubydebug
}
}
Here are snapshots of my output on ELK
Note: I have already tried the methods/filters mentioned below:
The commented-out ruby filter (it is not working)
Multiline input
A json codec in the input

How to create a grok/json filter to parse the below JSON format

I want to parse this JSON into Kibana using Logstash:
{
"Format": "IDEA0",
"ID": "2b03eb1f-fc4c-4f67-94e5-31c9fb32dccc",
"DetectTime": "2022-01-31T08:16:12.600470+07:00",
"EventTime": "2022-01-31T01:23:01.637438+00:00",
"Category": ['Intrusion.Botnet'],
"Confidence": 0.03,
"Note": "C&C channel, destination IP: 192.168.1.24 port: 8007/tcp score: 0.9324",
"Source": [{'IP4': ['192.168.1.25'], 'Type': ['CC']}]
}
I want ID, DetectTime, EventTime, Category, Confidence, Note, and Source each as a single field so that I can later do visualizations in Kibana.
Here is what I have already tried:
input {
file {
path => "/home/ubuntu/Downloads/StratosphereLinuxIPS/output/*.json"
start_position => "beginning"
sincedb_path => "/dev/null"
}
}
filter {
json {
source => "message"
}
}
output {
elasticsearch {
hosts => ["localhost:9200"]
index => "test-test"
user => "***"
password => "***"
}
stdout{}
}
But the fields are not separated correctly.
Any help would be appreciated.
Thanks.
:::UPDATE:::
I have already found the solution (with help from other people on the Elastic forum; it is not 100% optimized and needs a little more tweaking).
Here is the Logstash config I'm using, in case someone needs it in the future:
input {
file {
path => "/home/ubuntu/Downloads/StratosphereLinuxIPS/output/alerts.json"
start_position => "beginning"
sincedb_path => "/dev/null"
codec => multiline { pattern => "^{$" negate => "true" what => "previous" }
}
}
filter {
mutate {
gsub => ["message", "'", '"']
}
json {
source => "message"
}
}
output {
elasticsearch {
hosts => ["localhost:9200"]
index => "test-keempat"
user => "xxx"
password => "xxx"
}
stdout{ codec => rubydebug }
}
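To explain the gsub step: the entries in alerts.json use single quotes around the values inside Category and Source (as in the sample above), which is not valid JSON, so the mutate rewrites them to double quotes before the json filter runs. Roughly:
# before the gsub (not valid JSON):
#   "Source": [{'IP4': ['192.168.1.25'], 'Type': ['CC']}]
# after gsub => ["message", "'", '"']:
#   "Source": [{"IP4": ["192.168.1.25"], "Type": ["CC"]}]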
Thanks !

How To Remove Quotation Marks (" ") In Geo Coordinates On Logstash Conf File

I want to display a line on a map in Kibana, and the map needs a GeoJSON structure.
The data I have is in a SQL table, and I was about to transfer it to Elasticsearch using Logstash like this:
input{ ... }
filter{
if [lat] and [lon] {
mutate{convert => ["lat", "float"]}
mutate{convert => ["lon", "float"]}
mutate{convert => ["for_lat", "float"]}
mutate{convert => ["for_lon", "float"]}
mutate{
add_field => {"[location-geotest][type]" => "multilinestring"}
add_field => {"[location-geotest][coordinates]" => [["%{lon}", "%{lat}"]]}
add_field => {"[location-geotest][coordinates]" => [["%{for_lon}", "%{for_lat}"]]}
}
}
}
However, Logstash failed to index the data into Elasticsearch:
{
:status=>400,
:action=>["index", {:_id=>"18022", :_index=>"geo_shape_test", :routing=>nil, :_type=>"_doc"}, #<LogStash::Event:0x687994b9>],
:response=> {
"index"=>{
"_index"=>"geo_shape_test",
"_type"=>"_doc",
"_id"=>"18022",
"status"=>400,
"error"=>{
"type"=>"mapper_parsing_exception",
"reason"=>"failed to parse field [location-geotest] of type [geo_shape]",
"caused_by"=>{"type"=>"x_content_parse_exception",
"reason"=>"[1:164] [geojson] failed to parse field [coordinates]",
"caused_by"=>{
"type"=>"parse_exception",
"reason"=>"geo coordinates must be numbers"
}
}
}
}
}
}
and this is one of the events Logstash tried to index:
{
"lat" => 37.567179953757886,
"gps_id" => 10491,
"timestamp" => 2020-11-22T06:10:45.000Z,
"speed" => 17.25745240090587,
"lon" => 126.99598717854032,
"for_lat" => 37.567179953757886,
"#timestamp" => 2020-11-27T03:54:21.131Z,
"for_lon" => 126.99598717854032,
"#version" => "1",
"location-geotest" => {
"coordinates" => [
[0] "[\"126.99598717854032\", \"37.567179953757886\"]",
[1] "[\"126.99598717854032\", \"37.567179953757886\"]"
],
"type" => "multilinestring"
}
}
I think the problem is...
"coordinates" => [
[0] "[\"126.99598717854032\", \"37.567179953757886\"]",
[1] "[\"126.99598717854032\", \"37.567179953757886\"]"
],
If I could change that part, it would be:
"coordinates" => [
[0] [126.99598717854032, 37.567179953757886],
[1] [126.99598717854032, 37.567179953757886]
],
But I can't figure out how to do that.
I think the problem is as you say: the coordinates have to be floats instead of strings. Apparently the mutate filter converts the values back to strings, as mentioned in
https://discuss.elastic.co/t/logstash-mutate-filter-always-stringifies-hash-and-array/25917
They suggest using a ruby script instead.
This has been done for a linestring in:
https://discuss.elastic.co/t/geo-shape-geo-link-problems-with-coordinates/179924/4
From the data provided I don't see why you need a multilinestring; with only two points it should be enough to store it as a linestring.
I tried it out with:
filter{
if [lat] and [lon] {
mutate{
convert => ["lat", "float"]
convert => ["lon", "float"]
convert => ["for_lat", "float"]
convert => ["for_lon", "float"]
add_field => {"[location-geotest][type]" => "linestring"}
}
ruby{
code => "event.set('[location-geotest][coordinates]', [[event.get('lon'), event.get('lat')], [event.get('for_lon'), event.get('for_lat')]])"
}
}
}
and get the result:
"location-geotest" => {
"type" => "linestring",
"coordinates" => [
[0] [
[0] 126.99598717854032,
[1] 37.567179953757886
],
[1] [
[0] 126.99598717854032,
[1] 37.567179953757886
]
]
}
This is indexed correctly.
If you do need a multilinestring, I guess you need more data and one more layer of arrays in the ruby script.
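A minimal sketch of what that could look like (untested; it only wraps the linestring coordinates from above in one more array, since a multilinestring is an array of linestrings):
mutate {
  add_field => {"[location-geotest][type]" => "multilinestring"}
}
ruby {
  # one more level of nesting than the linestring version:
  # multilinestring coordinates = [ [ [lon, lat], [lon, lat], ... ], ... ]
  code => "event.set('[location-geotest][coordinates]', [[[event.get('lon'), event.get('lat')], [event.get('for_lon'), event.get('for_lat')]]])"
}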

Can't import JSON data from a file with Logstash

I'm trying to import JSON data from my log file mylogs.log. The following is my Logstash config file:
input {
stdin { }
file {
codec => "json"
path => "/logs/mylogs.log"
start_position => "beginning"
sincedb_path => "/dev/null"
}
}
filter{
json{
source => "message"
}
}
output {
elasticsearch {
hosts => ["localhost:9200"]
index => "jsonlog"
}
stdout { codec => rubydebug }
file {
path => "/logs/out.log"
}
}
After running this config, any JSON data I type on stdin gets parsed and sent to the Elasticsearch instance, and I can see it there. But the data that already exists in the log file does not get imported by Logstash.
Also, the JSON data that I add manually, which Logstash does parse and send to Elasticsearch, does not get written to my output file either.
I don't know what the issue is.
This is the sample JSON data I'm using:
{ "logger":"com.myApp.ClassName", "timestamp":"1456976539634", "level":"ERROR", "thread":"pool-3-thread-19", "message":"Danger. There was an error", "throwable":"java.Exception" }
{ "logger":"com.myApp.ClassName", "timestamp":"1456976539649", "level":"ERROR", "thread":"pool-3-thread-16", "message":"I cannot go on", "throwable":"java.Exception" }
OK, after making the following modification to the file path in the file input plugin of the Logstash config, it's working now.
input {
stdin { }
file {
codec => "json"
path => "/home/suresh/Desktop/tools/logstash-5.1.1/logs/mylogs.log"
start_position => "beginning"
sincedb_path => "/dev/null"
}
}
filter{
json{
source => "message"
}
}
output {
elasticsearch {
hosts => ["localhost:9200"]
index => "jsonlog2"
}
stdout { codec => rubydebug }
file {
path => "/home/suresh/Desktop/tools/logstash-5.1.1/logs/out.log"
}
}
But I'm getting an error:
"tags" => [
[0] "_jsonparsefailure"
]
Response from the console:
{
"path" => "/home/suresh/Desktop/tools/logstash-5.1.1/logs/mylogs.log",
"#timestamp" => 2016-12-27T09:56:08.854Z,
"level" => "ERROR",
"logger" => "com.myApp.ClassName",
"throwable" => "java.Exception",
"#version" => "1",
"host" => "BLR-SOFT-245",
"thread" => "pool-3-thread-19",
"message" => "Danger. There was an error",
"timestamp" => "1456976539634",
"tags" => [
[0] "_jsonparsefailure"
]
}
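A plausible cause of that _jsonparsefailure, not confirmed in the thread: the input already uses codec => "json", so by the time the json filter runs, the message field only contains the log entry's own "message" value ("Danger. There was an error"), which is not JSON. A sketch with the now-redundant json filter removed and a json codec on stdin as well:
input {
  # the json codec parses each event, so no separate json filter is needed
  stdin { codec => "json" }
  file {
    codec => "json"
    path => "/home/suresh/Desktop/tools/logstash-5.1.1/logs/mylogs.log"
    start_position => "beginning"
    sincedb_path => "/dev/null"
  }
}
output {
  elasticsearch {
    hosts => ["localhost:9200"]
    index => "jsonlog2"
  }
  stdout { codec => rubydebug }
  file {
    path => "/home/suresh/Desktop/tools/logstash-5.1.1/logs/out.log"
  }
}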

Logstash doesn't process all input files

I'm using Logstash to process several Zanox CSV exports and send them to Elasticsearch.
However, for some reason Logstash often processes only some of the input files.
The input files definitely exist in the given directory. To avoid the Logstash inode bug I've set sincedb_path to /dev/null, and I stop Logstash every day before the new files are downloaded and start it again after the downloads have completed.
Logstash and Elasticsearch are currently on the same server.
The first file (1) is the only one that is always imported. It is a rather big feed of about half a gigabyte.
Also, the Zanox CSV has a small glitch: the first line starts with a period, invalidating the CSV format for that line.
input {
file {
path => ["/var/app/1/*.csv"]
sincedb_path => "/dev/null"
start_position => beginning
type => "1"
}
file {
path => ["/var/app/2/*.csv"]
sincedb_path => "/dev/null"
start_position => beginning
type => "2"
}
file {
path => ["/var/app/3/*.csv"]
sincedb_path => "/dev/null"
start_position => beginning
type => "3"
}
file {
path => ["/var/app/4/*.csv"]
sincedb_path => "/dev/null"
start_position => beginning
type => "4"
}
file {
path => ["/var/app/5/*.csv"]
sincedb_path => "/dev/null"
start_position => beginning
type => "5"
}
file {
path => ["/var/app/6/*.csv"]
sincedb_path => "/dev/null"
start_position => beginning
type => "6"
}
}
filter {
if [type] == "1" {
csv {
columns => [ "title", "price", "image", "deeplink", "color" ]
separator => ";"
}
} else {
csv {
columns => [ "title", "price", "image", "deeplink" ]
separator => ";"
}
}
mutate {
convert => ["price", "float"]
add_field => {"source" => "%{type}"}
}
if ![title] {
drop { }
}
}
output {
elasticsearch{
index => products
index_type => products
host => localhost
document_id => "%{deeplink}"
flush_size => 5000
}
}
What could be the reason for logstash not processing all the files?
EDIT:
I removed the CSV processing errors by doing some pre-processing. Now I get the following in the Logstash log:
log4j, [2014-09-11T03:41:50.075] WARN: org.elasticsearch.monitor.jvm:
[logstash-17675126.onlinehome-server.info-13551-4016] [gc][young][1345505][23190] duration [2.4s],
collections [1]/[2.8s], total [2.4s]/[10.3m], memory [298.8mb]->[122.3mb]/[483.3mb],
all_pools {[young] [1.9mb]->[1.2mb]/[133.3mb]}{[survivor] [16.6mb]->[0b]/[16.6mb]}{[old] [280.2mb]->[121.1mb]/[333.3mb]}