Convert xml to json in Jenkinsfile

I have a problem with a method in my Jenkinsfile when I try to convert XML to JSON. Below are the method and the pipeline.
I tried to pass the method's result directly to echo, but it gives me an error and the pipeline fails.
Sorry, but I don't know what details I can give about the error, because I'm just starting to learn and this is the first time I've seen this code.
ERROR: org.xml.sax.SAXParseException; lineNumber: 2; columnNumber: 1; Content is not allowed in prolog.
I edited my question and added a bat step in the 'OWASP dependencies testing' stage. This bat step generates the XML automatically; I ran the file through an XML validator and it reported no errors. So I don't know whether the problem is in the Jenkinsfile code or in the XML, because the error is the same. I'm including only part of the XML because it's very long, but the error still points to the second line.
XML Code:
<?xml version="1.0" encoding="UTF-8"?>
<analysis xmlns="https://jeremylong.github.io/DependencyCheck/dependency-check.2.2.xsd">
    <scanInfo>
        <engineVersion>5.2.2</engineVersion>
        <dataSource>
            <name>NVD CVE Checked</name>
            <timestamp>2019-11-25T09:01:51</timestamp>
        </dataSource>
        <datasource>...</datasource>
    </scanInfo>
    ....................
</analysis>
import groovy.json.*;

def getDependencyResumeFromXML(pathReport){
    def xml = bat(script:'type ' + pathReport, returnStdout:true);
    def x = new XmlParser().parseText(xml);
    def nDep = x.dependencies.dependency.size();
    def dependencies = [:];
    for(def i=0;i<nDep;i++){
        dependencies[i] = [fileName: x.dependencies.dependency[i].fileName.text(),
                           description: x.dependencies.dependency[i].description.text(),
                           vulnerabilities: [:]];
        def nVul = x.dependencies.dependency[i].vulnerabilities.vulnerability.size();
        for(def j=0;j<nVul;j++){
            dependencies[i].vulnerabilities[j] = [
                name: x.dependencies.dependency[i].vulnerabilities.vulnerability[j].name.text(),
                cvssScore: x.dependencies.dependency[i].vulnerabilities.vulnerability[j].cvssScore.text(),
                severity: x.dependencies.dependency[i].vulnerabilities.vulnerability[j].severity.text(),
                cwe: x.dependencies.dependency[i].vulnerabilities.vulnerability[j].cwe.text(),
                description: x.dependencies.dependency[i].vulnerabilities.vulnerability[j].description.text(),
            ];
        }
    }
    return dependencies;
}
pipeline{
    .......
    stages{
        stage('OWASP dependencies testing'){
            steps{
                script{
                    bat 'mvn org.owasp:dependency-check-maven:check';
                    def pathReport = 'C:\\tmp\\workspace\\umbrella-pipeline-prueba\\target\\dependency-check\\dependency-check-report.xml';
                    def xml = bat(script:'type ' + pathReport, returnStdout:true);
                    echo '------------------ 1';
                    echo xml;
                    echo '------------------ 2';
                    echo '--------------------------------'
                    def dependencias = getDependencyResumeFromXML(pathReport);
                    echo '------------- 3';
                    echo dependencias;
                    echo '------------- 4';
                }
            }
        }
    }
}
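A likely cause of the "Content is not allowed in prolog" error: on Windows the bat step echoes the command line itself into the returned stdout, so the string handed to XmlParser starts with the shell prompt rather than with <?xml. A minimal sketch of a workaround, assuming that is where the stray prolog content comes from (readReportXml is an illustrative name):

// '@' suppresses the cmd.exe echo; as a safety net, also strip anything
// that still precedes the XML declaration before parsing.
def readReportXml(pathReport) {
    def raw = bat(script: '@type ' + pathReport, returnStdout: true)
    def start = raw.indexOf('<?xml')
    if (start < 0) {
        error "No XML declaration found in ${pathReport}"
    }
    return new XmlParser().parseText(raw.substring(start))
}

Note also that echo expects a String, so echoing the finished map as JSON, e.g. echo JsonOutput.toJson(dependencias), would both complete the XML-to-JSON conversion the title asks about and avoid passing a Map to echo.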

Related

Jenkins/groovy: How to pretty-print a net.sf.json.JSONObject with null?

Working on a Jenkins pipeline, I observed what looks like infinite recursion causing a stack overflow when I use JsonOutput.toJson() on a net.sf.json.JSONObject that slurped a JSON string containing null.
The following minimal code demonstrates the problem:
// Jenkinsfile
@Library('libs@dev') libs
import groovy.json.JsonOutput

pipeline {
    agent any
    stages {
        stage( "json" ) {
            steps {
                script {
                    my_lib.to_json_handbuilt_linkedhashmap()
                    my_lib.to_json_readjson()
                    my_lib.to_json_readjson_as_linkedhashmap()
                }
            }
        }
    }
}
// vars/my_lib.groovy
import groovy.json.JsonOutput
def asMap(j) {
    return j as LinkedHashMap
}

// This function is successful.
def to_json_handbuilt_linkedhashmap() {
    def d = [:]
    d.issues = null
    echo "---- handmade LinkedHashMap ----"
    echo "d ${d}"
    echo "d.getClass() ${d.getClass()}"
    echo "JsonOutput.toJson(d) ${JsonOutput.toJson(d)}"
}

// This function fails from infinite recursion causing a stack overflow.
def to_json_readjson() {
    def d = readJSON(text: '{ "issues" : null }')
    echo "---- readJSON ----"
    echo "d ${d}"
    echo "d.getClass() ${d.getClass()}"
    echo "JsonOutput.toJson(d) ${JsonOutput.toJson(d)}"
}

// This function also fails from infinite recursion causing a stack overflow.
def to_json_readjson_as_linkedhashmap() {
    def d = asMap(readJSON(text: '{ "issues" : null }'))
    echo "---- readJSON -> asMap ----"
    echo "d ${d}"
    echo "d.getClass() ${d.getClass()}"
    echo "JsonOutput.toJson(d) ${JsonOutput.toJson(d)}"
}
In the code above, to_json_readjson() fails with a stack overflow when JsonOutput.toJson() is called with the net.sf.json.JSONObject returned by readJSON(text: '{ "issues" : null }').
The Jenkins console output is at the end of this post.
In to_json_handbuilt_linkedhashmap() JsonOutput.toJson() is successful when called with a handcrafted LinkedHashMap equivalent to { "issues" : null }.
Lastly, in to_json_readjson_as_linkedhashmap(), JsonOutput.toJson() again fails with a stack overflow when called with a LinkedHashMap created from a net.sf.json.JSONObject.
Question:
Can someone please explain what's causing the stack overflow when readJSON() and/or JsonOutput.toJson() are used with a JSON string that has null?
Because my handcrafted LinkedHashMap was successful with JsonOutput.toJson(), I thought the problem was passing JsonOutput.toJson() a net.sf.json.JSONObject.
But I think that theory is ruled out because in to_json_readjson_as_linkedhashmap(), I give JsonOutput.toJson() a LinkedHashMap, albeit created from a net.sf.json.JSONObject.
The problem would appear to be some combination of readJSON() and/or JsonOutput.toJson() that I'm failing to grasp.
I tried, but have given up trying to use a JsonSlurper, because I'm unable to even create an instance of one.
The (truncated) stack overflow error likely showing infinite recursion:
Posting build status of FAILED to bitbucket.company.com
java.lang.StackOverflowError
at java.io.PrintStream.flush(PrintStream.java:338)
at sun.nio.cs.StreamEncoder.implFlush(StreamEncoder.java:297)
at sun.nio.cs.StreamEncoder.flush(StreamEncoder.java:141)
at java.io.OutputStreamWriter.flush(OutputStreamWriter.java:229)
at java.util.logging.StreamHandler.flush(StreamHandler.java:259)
at java.util.logging.ConsoleHandler.publish(ConsoleHandler.java:117)
at java.util.logging.Logger.log(Logger.java:738)
at java.util.logging.Logger.doLog(Logger.java:765)
at java.util.logging.Logger.throwing(Logger.java:1447)
at org.codehaus.groovy.runtime.DefaultGroovyMethods.getProperties(DefaultGroovyMethods.java:391)
at groovy.json.JsonOutput.getObjectProperties(JsonOutput.java:327)
at groovy.json.JsonOutput.writeObject(JsonOutput.java:320)
at groovy.json.JsonOutput.writeMap(JsonOutput.java:458)
at groovy.json.JsonOutput.writeObject(JsonOutput.java:321)
at groovy.json.JsonOutput.writeMap(JsonOutput.java:458)
at groovy.json.JsonOutput.writeObject(JsonOutput.java:321)
at groovy.json.JsonOutput.writeMap(JsonOutput.java:458)
at groovy.json.JsonOutput.writeObject(JsonOutput.java:321)
at groovy.json.JsonOutput.writeMap(JsonOutput.java:458)
at groovy.json.JsonOutput.writeObject(JsonOutput.java:321)
at groovy.json.JsonOutput.writeMap(JsonOutput.java:458)
at groovy.json.JsonOutput.writeObject(JsonOutput.java:321)
at groovy.json.JsonOutput.writeMap(JsonOutput.java:458)
at groovy.json.JsonOutput.writeObject(JsonOutput.java:321)
Can you sidestep this immediate problem by using readJSON's returnPojo: true parameter, thereby solving your overall task sooner?
Getting plain old nulls rather than net.sf.json.JSONNull objects really helped me today, though my problem involved producing CSV rather than using JsonOutput.
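A minimal sketch of that suggestion, reusing the failing example from the question; returnPojo makes readJSON return plain LinkedHashMap/ArrayList objects and real nulls instead of net.sf.json types such as JSONNull:

// With returnPojo: true the step no longer returns net.sf.json objects,
// so JsonOutput.toJson has nothing to recurse on.
def d = readJSON(text: '{ "issues" : null }', returnPojo: true)
echo "d.getClass() ${d.getClass()}"                 // java.util.LinkedHashMap
echo "JsonOutput.toJson(d) ${JsonOutput.toJson(d)}" // {"issues":null}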

AWS SAM CLI DEPLOYMENT - Issues Parsing Out of the Box secret = get_secret_value_response['SecretString']

I am very green when it comes to Python, so bear with me.
I'm using the out-of-the-box Python 3 sample code provided by AWS to return the 'SecretString' from the AWS Secrets Manager service.
No issues there... I get the returned object (note I have blanked out some details):
{"username":"postgres","password":"XXXXXXXXX","engine":"postgres","host":"srdataset.XXXXXXXXX.ap-southeast-2.rds.amazonaws.com","port":5432,"dbInstanceIdentifier":"srdataset"}
The details are all correct.
I am then using json.loads() to parse the above into my next function so I can extract the details like so
# request details
login_details = get_secret("pg_srdataset_login_details")
# load json
y = json.loads(login_details)
# extract; the result is a Python dictionary:
print(y["username"])
This again all works fine in my IDE (PyCharm). I can run the code, build it in a Docker container, and then use the PyCharm AWS SAM CLI to deploy the code to the cloud with no issues.
However, when I test the function in AWS, the code bugs out on the y = json.loads(login_details) line.
The error being:
{
    "errorMessage": "Expecting value: line 1 column 1 (char 0)",
    "errorType": "JSONDecodeError",
    "stackTrace": [
        " File \"/var/task/update_sp_changes.py\", line 229, in lambda_handler\n y = json.loads(login_details)\n",
        " File \"/var/lang/lib/python3.8/json/__init__.py\", line 357, in loads\n return _default_decoder.decode(s)\n",
        " File \"/var/lang/lib/python3.8/json/decoder.py\", line 337, in decode\n obj, end = self.raw_decode(s, idx=_w(s, 0).end())\n",
        " File \"/var/lang/lib/python3.8/json/decoder.py\", line 355, in raw_decode\n raise JSONDecodeError(\"Expecting value\", s, err.value) from None\n"
    ]
}
To test this, I also copied the JSON 'SecretString' returned from AWS, hard-coded it as a variable, then passed that variable directly into the y = json.loads(login_details) step. Tested again, and it works a treat.
What am I doing wrong, and how can I work around this issue?
For reference, I'll share Lambda print-debug code.
Preparation:
from the Lambda-py38 initial state, the SecretsManagerReadWrite policy is attached to the Lambda
set SecretId and VersionId
Output:
json.loads() works for the plaintext
print() output is almost the same for plain_text:str and secret_dict:dict. (The difference is the quoting, single vs. double, which is treated as the same string when copy-pasted.)
import json
import boto3

def lambda_handler(event, context):
    client = boto3.client('secretsmanager')
    response = client.get_secret_value(
        SecretId='arn:aws:secretsmanager:xxxxxxxxxxxxxxxxxxxxxxxxxxxxx',
        VersionId='xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx',
        VersionStage='AWSCURRENT'
    )
    print("type of response: ", type(response))
    print("response: ", response)
    plaintext = response['SecretString']
    print("type of plaintext: ", type(plaintext))
    print("plaintext: ", plaintext)
    secret_dict = json.loads(plaintext)
    print("type of secret_dict: ", type(secret_dict))
    print("secret_dict: ", secret_dict)
    return 200
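If the deployed function still misbehaves, printing repr() of the value immediately before json.loads() usually shows what is really there (None, an error message, or binary content). A defensive sketch along those lines; load_secret_json is an illustrative helper, not part of the AWS sample:

import base64
import json
import boto3

def load_secret_json(secret_id):
    # Fetch a secret and parse it as JSON, covering both string and binary secrets.
    client = boto3.client("secretsmanager")
    response = client.get_secret_value(SecretId=secret_id)
    if "SecretString" in response:
        raw = response["SecretString"]
    else:
        # Binary secrets come back base64-encoded, as in the AWS sample code
        raw = base64.b64decode(response["SecretBinary"]).decode("utf-8")
    print("raw secret payload:", repr(raw))  # repr() exposes stray prefixes/whitespace
    return json.loads(raw)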

ruby sketchup scene serialization

I am very new to SketchUp and Ruby; I have worked with Java and C#, but this is my first time with Ruby.
Now I have one problem: I need to serialize the whole scene into one JSON document (scene hierarchy, object name, object material, and position, for each single object). How can I do this?
I have already done this for Unity3D (C#) without a problem.
I tried this:
def main
  avr_entities = Sketchup.active_model.entities # all objects
  ambiens_dictionary = {}
  ambiens_list = []
  avr_entities.each do |root|
    if root.is_a?(Sketchup::Group) || root.is_a?(Sketchup::ComponentInstance)
      if root.name == ""
        UI.messagebox("this is a group #{root.definition.name}")
        if root.entities.count > 0
          root.entities.each do |leaf|
            if leaf.is_a?(Sketchup::Group) || leaf.is_a?(Sketchup::ComponentInstance)
              UI.messagebox("this is a leaf #{leaf.definition.name}")
            end
          end
        end
      else
        # UI.messagebox("this is a leaf #{root.name}")
      end
    end
  end
end
Have you tried the JSON library?
require 'json'
source = { a: [ { b: "hello" }, 1, "world" ], c: 'hi' }
source.to_json # => "{\"a\":[{\"b\":\"hello\"},1,\"world\"],\"c\":\"hi\"}"
I used the code below to answer another question, but it might also work here.
The code can run outside of SketchUp for testing in the terminal. Just make sure to follow these steps:
Copy the code below and paste it into a Ruby file (example: file.rb)
Run the script in a terminal: ruby file.rb
The script will write data to the JSON file and also read the content of the JSON file back.
The path to the JSON file is relative to the Ruby file created in step one. If the script can't find the path, it will create the JSON file for you.
module DeveloperName
  module PluginName
    require 'json'
    require 'fileutils'

    class Main
      def initialize
        path = File.dirname(__FILE__)
        @json = File.join(path, 'file.json')
        @content = { 'hello' => 'hello world' }.to_json
        json_create(@content)
        json_read(@json)
      end

      def json_create(content)
        File.open(@json, 'w') { |f| f.write(content) }
      end

      def json_read(json)
        if File.exist?(json)
          file = File.read(json)
          data_hash = JSON.parse(file)
          puts "Json content: #{data_hash}"
        else
          msg = 'JSON file not found'
          UI.messagebox(msg, MB_OK)
        end
      end
      # # #
    end
    DeveloperName::PluginName::Main.new
  end
end
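The code above covers writing and reading the JSON file; the remaining piece is walking the scene graph itself. A rough sketch of that recursion, treating transformation.origin as the "position" and using placeholder key names (adjust the schema to whatever your importer expects):

require 'json'

def serialize_entity(entity)
  name = entity.name
  name = entity.definition.name if name.empty? && entity.is_a?(Sketchup::ComponentInstance)
  node = {
    'name'     => name,
    'material' => entity.material ? entity.material.name : nil,
    'position' => entity.transformation.origin.to_a,
    'children' => []
  }
  # Groups own their entities directly; component instances share their definition's
  children = entity.is_a?(Sketchup::Group) ? entity.entities : entity.definition.entities
  children.each do |child|
    next unless child.is_a?(Sketchup::Group) || child.is_a?(Sketchup::ComponentInstance)
    node['children'] << serialize_entity(child)
  end
  node
end

scene = Sketchup.active_model.entities
                .select { |e| e.is_a?(Sketchup::Group) || e.is_a?(Sketchup::ComponentInstance) }
                .map { |e| serialize_entity(e) }

puts JSON.pretty_generate(scene)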

error in json in corona lua

Hi, I have found a tutorial on how to POST JSON in Lua. Here is the code:
http = require("socket.http")
crypto = require("crypto")
ltn12 = require("ltn12")
url = require("socket.url")
local json = require("json")
local commands_json =
{
    ["message"] = "Hello",
}
print (commands_json)
local json = {}
json.api_key = "6_192116334"
json.ver = 1
json.commands_json = json.encode(commands_json)
json.commands_hash = crypto.digest(crypto.md5, json.commands_json .. 'hkjhkjhkjh')
local post = "api=" .. url.escape(Json.Encode(json))
local response = {}
local r, c, h = http.request {
    url = "http://127.0.0.1/?page=api",
    method = "POST",
    headers = {
        ["content-length"] = #post,
        ["Content-Type"] = "application/x-www-form-urlencoded"
    },
    source = ltn12.source.string(post),
    sink = ltn12.sink.table(response)
}
local path = system.pathForFile("r.txt", system.DocumentsDirectory)
local file = io.open (path, "w")
file:write (Json.Encode(json) .. "\n")
file:write (post .. "\n")
file:write (response[1] .. "\n")
io.close (file)
json = Json.Decode(table.concat(response,''))
native.showAlert("hey", json.commands[1].tot_nbr_rows)
Now I get this error:
Windows simulator build date: Dec 9 2011 # 14:01:29
Copyright (C) 2009-2011 Ansca, Inc.
Version: 2.0.0
Build: 2011.704
table: 0346D6D0
Runtime error
...nistrator\my documents\corona projects\json\main.lua:17: attempt to call field 'encode' (a nil value)
stack traceback:
[C]: in function 'encode'
...nistrator\my documents\corona projects\json\main.lua:17: in main chunk
Runtime error: ...nistrator\my documents\corona projects\json\main.lua:17: attempt to call field 'encode' (a nil value)
stack traceback:
[C]: in function 'encode'
...nistrator\my documents\corona projects\json\main.lua:17: in main chunk
I don't know why I get the error from encode.
Can anyone help me with this?
Thanks in advance...
This loads the externally provided JSON code, which includes an encode function:
local json = require("json")
This throws away your old json variable and replaces it with an empty table:
local json = {}
And this tries to call json.encode which is now undefined since you redefined json as an empty table above:
json.commands_json = json.encode(commands_json)
The solution is to pick a different variable name.
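A minimal sketch of that fix: keep the library under json and move your request data into a differently named table (the later Json.Encode/Json.Decode calls would likewise become json.encode/json.decode on the renamed table):

local json = require("json")  -- the Corona JSON library keeps its name

local payload = {}            -- the request data gets its own table
payload.api_key = "6_192116334"
payload.ver = 1
payload.commands_json = json.encode(commands_json)
payload.commands_hash = crypto.digest(crypto.md5, payload.commands_json .. 'hkjhkjhkjh')

local post = "api=" .. url.escape(json.encode(payload))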

Can you provide examples of parsing HTML?

How do you parse HTML with a variety of languages and parsing libraries?
When answering:
In answers to questions about how to parse HTML with regexes, individual answers here will be linked to as a way of showing the right way to do things.
For the sake of consistency, I ask that the example parse an HTML file for the href in anchor tags. To make it easy to search this question, I ask that you follow this format:
Language: [language name]
Library: [library name]
[example code]
Please make the library a link to the documentation for the library. If you want to provide an example other than extracting links, please also include:
Purpose: [what the parse does]
Language: JavaScript
Library: jQuery
$.each($('a[href]'), function(){
    console.debug(this.href);
});
(using firebug console.debug for output...)
And loading any html page:
$.get('http://stackoverflow.com/', function(page){
    $(page).find('a[href]').each(function(){
        console.debug(this.href);
    });
});
I used another each function for this one; I think it's cleaner when chaining methods.
Language: C#
Library: HtmlAgilityPack
class Program
{
    static void Main(string[] args)
    {
        var web = new HtmlWeb();
        var doc = web.Load("http://www.stackoverflow.com");
        var nodes = doc.DocumentNode.SelectNodes("//a[@href]");
        foreach (var node in nodes)
        {
            Console.WriteLine(node.InnerHtml);
        }
    }
}
language: Python
library: BeautifulSoup
from BeautifulSoup import BeautifulSoup

html = "<html><body>"
for link in ("foo", "bar", "baz"):
    html += '<a href="http://%s.com">%s</a>' % (link, link)
html += "</body></html>"

soup = BeautifulSoup(html)
links = soup.findAll('a', href=True) # find <a> with a defined href attribute
print links
output:
[<a href="http://foo.com">foo</a>, <a href="http://bar.com">bar</a>, <a href="http://baz.com">baz</a>]
also possible:
for link in links:
    print link['href']
output:
http://foo.com
http://bar.com
http://baz.com
Language: Perl
Library: pQuery
use strict;
use warnings;
use pQuery;

my $html = join '',
    "<html><body>",
    (map { qq(<a href="http://$_.com">$_</a>) } qw/foo bar baz/),
    "</body></html>";

pQuery( $html )->find( 'a' )->each(
    sub {
        my $at = $_->getAttribute( 'href' );
        print "$at\n" if defined $at;
    }
);
language: shell
library: lynx (well, it's not a library, but in a shell every program is kind of a library)
lynx -dump -listonly http://news.google.com/
language: Ruby
library: Hpricot
#!/usr/bin/ruby
require 'hpricot'

html = '<html><body>'
['foo', 'bar', 'baz'].each {|link| html += "<a href=\"http://#{link}.com\">#{link}</a>" }
html += '</body></html>'

doc = Hpricot(html)
doc.search('//a').each {|elm| puts elm.attributes['href'] }
language: Python
library: HTMLParser
#!/usr/bin/python
from HTMLParser import HTMLParser

class FindLinks(HTMLParser):
    def __init__(self):
        HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        at = dict(attrs)
        if tag == 'a' and 'href' in at:
            print at['href']

find = FindLinks()
html = "<html><body>"
for link in ("foo", "bar", "baz"):
    html += '<a href="http://%s.com">%s</a>' % (link, link)
html += "</body></html>"
find.feed(html)
language: Perl
library: HTML::Parser
#!/usr/bin/perl
use strict;
use warnings;
use HTML::Parser;

my $find_links = HTML::Parser->new(
    start_h => [
        sub {
            my ($tag, $attr) = @_;
            if ($tag eq 'a' and exists $attr->{href}) {
                print "$attr->{href}\n";
            }
        },
        "tag, attr"
    ]
);

my $html = join '',
    "<html><body>",
    (map { qq(<a href="http://$_.com">$_</a>) } qw/foo bar baz/),
    "</body></html>";

$find_links->parse($html);
Language Perl
Library: HTML::LinkExtor
Beauty of Perl is that you have modules for very specific tasks. Like link extraction.
Whole program:
#!/usr/bin/perl -w
use strict;
use HTML::LinkExtor;
use LWP::Simple;

my $url     = 'http://www.google.com/';
my $content = get( $url );

my $p = HTML::LinkExtor->new( \&process_link, $url );
$p->parse( $content );
exit;

sub process_link {
    my ( $tag, %attr ) = @_;
    return unless $tag eq 'a';
    return unless defined $attr{ 'href' };
    print "- $attr{'href'}\n";
    return;
}
Explanation:
use strict - turns on "strict" mode - eases potential debugging, not fully relevant to the example
use HTML::LinkExtor - loads the interesting module
use LWP::Simple - just a simple way to get some html for tests
my $url = 'http://www.google.com/' - which page we will be extracting urls from
my $content = get( $url ) - fetches page html
my $p = HTML::LinkExtor->new( \&process_link, $url ) - creates the LinkExtor object, giving it a reference to a function that will be used as a callback on every url, and $url to use as BASEURL for relative urls
$p->parse( $content ) - pretty obvious I guess
exit - end of program
sub process_link - beginning of the function process_link
my ($tag, %attr) - get arguments, which are the tag name and its attributes
return unless $tag eq 'a' - skip processing if the tag is not <a>
return unless defined $attr{'href'} - skip processing if the <a> tag doesn't have an href attribute
print "- $attr{'href'}\n"; - pretty obvious I guess :)
return; - finish the function
That's all.
Language: Ruby
Library: Nokogiri
#!/usr/bin/env ruby
require 'nokogiri'
require 'open-uri'
document = Nokogiri::HTML(open("http://google.com"))
document.css("html head title").first.content
=> "Google"
document.xpath("//title").first.content
=> "Google"
Language: Common Lisp
Library: Closure Html, Closure Xml, CL-WHO
(shown using DOM API, without using XPATH or STP API)
(defvar *html*
  (who:with-html-output-to-string (stream)
    (:html
      (:body (loop
               for site in (list "foo" "bar" "baz")
               do (who:htm (:a :href (format nil "http://~A.com/" site))))))))

(defvar *dom*
  (chtml:parse *html* (cxml-dom:make-dom-builder)))

(loop
  for tag across (dom:get-elements-by-tag-name *dom* "a")
  collect (dom:get-attribute tag "href"))
=>
("http://foo.com/" "http://bar.com/" "http://baz.com/")
Language: Clojure
Library: Enlive (a selector-based (à la CSS) templating and transformation system for Clojure)
Selector expression:
(def test-select
(html/select (html/html-resource (java.io.StringReader. test-html)) [:a]))
Now we can do the following at the REPL (I've added line breaks in test-select):
user> test-select
({:tag :a, :attrs {:href "http://foo.com/"}, :content ["foo"]}
{:tag :a, :attrs {:href "http://bar.com/"}, :content ["bar"]}
{:tag :a, :attrs {:href "http://baz.com/"}, :content ["baz"]})
user> (map #(get-in % [:attrs :href]) test-select)
("http://foo.com/" "http://bar.com/" "http://baz.com/")
You'll need the following to try it out:
Preamble:
(require '[net.cgrand.enlive-html :as html])
Test HTML:
(def test-html
  (apply str (concat ["<html><body>"]
                     (for [link ["foo" "bar" "baz"]]
                       (str "<a href='http://" link ".com/'>" link "</a>"))
                     ["</body></html>"])))
language: Perl
library: XML::Twig
#!/usr/bin/perl
use strict;
use warnings;
use Encode ':all';
use LWP::Simple;
use XML::Twig;
#my $url = 'http://stackoverflow.com/questions/773340/can-you-provide-an-example-of-parsing-html-with-your-favorite-parser';
my $url = 'http://www.google.com';
my $content = get($url);
die "Couldn't fetch!" unless defined $content;
my $twig = XML::Twig->new();
$twig->parse_html($content);
my @hrefs = map {
    $_->att('href');
} $twig->get_xpath('//*[@href]');

print "$_\n" for @hrefs;
Caveat: you can get wide-character errors with pages like this one (changing the URL to the one commented out will trigger this error), but the HTML::Parser solution above doesn't share this problem.
Language: Perl
Library: HTML::Parser
Purpose: How can I remove unused, nested HTML span tags with a Perl regex?
Language: Java
Libraries: XOM, TagSoup
I've included intentionally malformed and inconsistent XML in this sample.
import java.io.IOException;
import nu.xom.Builder;
import nu.xom.Document;
import nu.xom.Element;
import nu.xom.Node;
import nu.xom.Nodes;
import nu.xom.ParsingException;
import nu.xom.ValidityException;
import org.ccil.cowan.tagsoup.Parser;
import org.xml.sax.SAXException;
public class HtmlTest {
    public static void main(final String[] args) throws SAXException, ValidityException, ParsingException, IOException {
        final Parser parser = new Parser();
        parser.setFeature(Parser.namespacesFeature, false);
        final Builder builder = new Builder(parser);
        final Document document = builder.build("<html><body><ul><li><a href=\"http://google.com\">google</a></li><li><a HREF=\"http://reddit.org\" target=\"_blank\">reddit</li><li><a name=\"nothing\">nothing</a><li></ul></body></html>", null);
        final Element root = document.getRootElement();
        final Nodes links = root.query("//a[@href]");
        for (int linkNumber = 0; linkNumber < links.size(); ++linkNumber) {
            final Node node = links.get(linkNumber);
            System.out.println(((Element) node).getAttributeValue("href"));
        }
    }
}
TagSoup adds an XML namespace referencing XHTML to the document by default. I've chosen to suppress that in this sample. Using the default behavior would require the call to root.query to include a namespace like so:
root.query("//xhtml:a[@href]", new nu.xom.XPathContext("xhtml", root.getNamespaceURI()));
Language: C#
Library: System.XML (standard .NET)
using System.Collections.Generic;
using System.Xml;

public static void Main(string[] args)
{
    List<string> matches = new List<string>();
    XmlDocument xd = new XmlDocument();
    xd.LoadXml("<html>...</html>");
    FindHrefs(xd.FirstChild, matches);
}

static void FindHrefs(XmlNode xn, List<string> matches)
{
    if (xn.Attributes != null && xn.Attributes["href"] != null)
        matches.Add(xn.Attributes["href"].InnerXml);
    foreach (XmlNode child in xn.ChildNodes)
        FindHrefs(child, matches);
}
Language: PHP
Library: SimpleXML (and DOM)
<?php
$page = new DOMDocument();
$page->strictErrorChecking = false;
$page->loadHTMLFile('http://stackoverflow.com/questions/773340');
$xml = simplexml_import_dom($page);
$links = $xml->xpath('//a[@href]');
foreach($links as $link)
    echo $link['href']."\n";
Language: JavaScript
Library: DOM
var links = document.links;
for(var i in links){
    var href = links[i].href;
    if(href != null) console.debug(href);
}
(using firebug console.debug for output...)
Language: Racket
Library: (planet ashinn/html-parser:1) and (planet clements/sxml2:1)
(require net/url
(planet ashinn/html-parser:1)
(planet clements/sxml2:1))
(define the-url (string->url "http://stackoverflow.com/"))
(define doc (call/input-url the-url get-pure-port html->sxml))
(define links ((sxpath "//a/@href/text()") doc))
The above example, using packages from the new package system: html-parsing and sxml:
(require net/url
html-parsing
sxml)
(define the-url (string->url "http://stackoverflow.com/"))
(define doc (call/input-url the-url get-pure-port html->xexp))
(define links ((sxpath "//a/@href/text()") doc))
Note: Install the required packages with 'raco' from a command line, with:
raco pkg install html-parsing
and:
raco pkg install sxml
language: Python
library: lxml.html
import lxml.html

html = "<html><body>"
for link in ("foo", "bar", "baz"):
    html += '<a href="http://%s.com">%s</a>' % (link, link)
html += "</body></html>"

tree = lxml.html.document_fromstring(html)
for element, attribute, link, pos in tree.iterlinks():
    if attribute == "href":
        print link
lxml also has a CSS selector class for traversing the DOM, which can make using it very similar to using jQuery:
for a in tree.cssselect('a[href]'):
    print a.get('href')
Language: Objective-C
Library: libxml2 + Matt Gallagher's libxml2 wrappers + Ben Copsey's ASIHTTPRequest
ASIHTTPRequest *request = [[ASIHTTPRequest alloc] initWithURL:[NSURL URLWithString:@"http://stackoverflow.com/questions/773340"]];
[request start];
NSError *error = [request error];
if (!error) {
    NSData *response = [request responseData];
    NSLog(@"Data: %@", [[self query:@"//a[@href]" withResponse:response] description]);
    [request release];
}
else
    @throw [NSException exceptionWithName:@"kMyHTTPRequestFailed" reason:@"Request failed!" userInfo:nil];
...
- (id) query:(NSString *)xpathQuery withResponse:(NSData *)resp {
    NSArray *nodes = PerformHTMLXPathQuery(resp, xpathQuery);
    if (nodes != nil)
        return nodes;
    return nil;
}
Language: Perl
Library: HTML::TreeBuilder
use strict;
use HTML::TreeBuilder;
use LWP::Simple;

my $content = get 'http://www.stackoverflow.com';
my $document = HTML::TreeBuilder->new->parse($content)->eof;

for my $a ($document->find('a')) {
    print $a->attr('href'), "\n" if $a->attr('href');
}
Language: PHP
Library: DOM
<?php
$doc = new DOMDocument();
$doc->strictErrorChecking = false;
$doc->loadHTMLFile('http://stackoverflow.com/questions/773340');
$xpath = new DOMXpath($doc);
$links = $xpath->query('//a[@href]');
for ($i = 0; $i < $links->length; $i++)
    echo $links->item($i)->getAttribute('href'), "\n";
Sometimes it's useful to put the @ symbol before $doc->loadHTMLFile to suppress invalid HTML parsing warnings.
Language: Python
Library: HTQL
import htql;
page="<a href=a.html>1</a><a href=b.html>2</a><a href=c.html>3</a>";
query="<a>:href,tx";
for url, text in htql.HTQL(page, query):
print url, text;
Simple and intuitive.
language: Ruby
library: Nokogiri
#!/usr/bin/env ruby
require "nokogiri"
require "open-uri"
doc = Nokogiri::HTML(open('http://www.example.com'))
hrefs = doc.search('a').map{ |n| n['href'] }
puts hrefs
Which outputs:
/
/domains/
/numbers/
/protocols/
/about/
/go/rfc2606
/about/
/about/presentations/
/about/performance/
/reports/
/domains/
/domains/root/
/domains/int/
/domains/arpa/
/domains/idn-tables/
/protocols/
/numbers/
/abuse/
http://www.icann.org/
mailto:iana@iana.org?subject=General%20website%20feedback
This is a minor spin on the one above, resulting in an output that is usable for a report. I only return the first and last elements in the list of hrefs:
#!/usr/bin/env ruby
require "nokogiri"
require "open-uri"

doc = Nokogiri::HTML(open('http://nokogiri.org'))
hrefs = doc.search('a[href]').map{ |n| n['href'] }

puts hrefs
  .each_with_index                      # add an array index
  .minmax{ |a,b| a.last <=> b.last }    # find the first and last element
  .map{ |h,i| '%3d %s' % [1 + i, h ] }  # format the output

Which outputs:
  1 http://github.com/tenderlove/nokogiri
100 http://yokolet.blogspot.com
Language: Java
Library: jsoup
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

public class HtmlTest {
    public static void main(final String[] args) {
        final Document document = Jsoup.parse("<html><body><ul><li><a href=\"http://google.com\">google</a></li><li><a HREF=\"http://reddit.org\" target=\"_blank\">reddit</li><li><a name=\"nothing\">nothing</a><li></ul></body></html>");
        final Elements links = document.select("a[href]");
        for (final Element element : links) {
            System.out.println(element.attr("href"));
        }
    }
}
Using phantomjs, save this file as extract-links.js:
var page = new WebPage(),
    url = 'http://www.udacity.com';

page.open(url, function (status) {
    if (status !== 'success') {
        console.log('Unable to access network');
    } else {
        var results = page.evaluate(function() {
            var list = document.querySelectorAll('a'), links = [], i;
            for (i = 0; i < list.length; i++) {
                links.push(list[i].href);
            }
            return links;
        });
        console.log(results.join('\n'));
    }
    phantom.exit();
});
run:
$ ../path/to/bin/phantomjs extract-links.js
Language: Coldfusion 9.0.1+
Library: jSoup
<cfscript>
function parseURL(required string url){
    var res = [];
    var javaLoader = createObject("javaloader.JavaLoader").init([expandPath("./jsoup-1.7.3.jar")]);
    var jSoupClass = javaLoader.create("org.jsoup.Jsoup");
    //var dom = jSoupClass.parse(html); // if you already have some html to parse.
    var dom = jSoupClass.connect( arguments.url ).get();
    var links = dom.select("a");
    for(var a=1;a LT arrayLen(links);a++){
        var s={};
        s.href= links[a].attr('href');
        s.text= links[a].text();
        if(s.href contains "http://" || s.href contains "https://") arrayAppend(res,s);
    }
    return res;
}
//writeoutput(writedump(parseURL(url)));
</cfscript>
<cfdump var="#parseURL("http://stackoverflow.com/questions/773340/can-you-provide-examples-of-parsing-html")#">
Returns an array of structures; each struct contains HREF and TEXT keys.
Language: JavaScript/Node.js
Library: Request and Cheerio
var request = require('request');
var cheerio = require('cheerio');

var url = "https://news.ycombinator.com/";
request(url, function (error, response, html) {
    if (!error && response.statusCode == 200) {
        var $ = cheerio.load(html);
        var anchorTags = $('a');
        anchorTags.each(function(i,element){
            console.log(element["attribs"]["href"]);
        });
    }
});
The request library downloads the HTML document and cheerio lets you use jQuery-style CSS selectors to target elements in it.