What is the best practice to parse html in swift? - html

I'm a Swift newbie. I need for something like Python's BeautifulSoup in Swift iOS project. Precisely, I need to get all href of <a> that ends with ".txt". What are the steps that I should take?

There are several nice libraries of HTML Parsing using Swift and Objective-C like the followings:
hpple
NDHpple
Kanna( old Swift-HTML-Parser)
Fuzi
SwiftSoup
Ji
Take a look in the following examples in the four libraries posted above, mainly parsed using XPath 2.0:
hpple:
let data = NSData(contentsOfFile: path)
let doc = TFHpple(htmlData: data)
if let elements = doc.searchWithXPathQuery("//a/#href[ends-with(.,'.txt')]") as? [TFHppleElement] {
for element in elements {
println(element.content)
}
}
NDHpple:
let data = NSData(contentsOfFile: path)!
let html = NSString(data: data, encoding: NSUTF8StringEncoding)!
let doc = NDHpple(HTMLData: html)
if let elements = doc.searchWithXPathQuery("//a/#href[ends-with(.,'.txt')]") {
for element in elements {
println(element.children?.first?.content)
}
}
Kanna (Xpath and CSS Selectors):
let html = "<html><head></head><body><ul><li><input type='image' name='input1' value='string1value' class='abc' /></li><li><input type='image' name='input2' value='string2value' class='def' /></li></ul><span class='spantext'><b>Hello World 1</b></span><span class='spantext'><b>Hello World 2</b></span><a href='example.com'>example(English)</a><a href='example.co.jp'>example(JP)</a></body>"
if let doc = Kanna.HTML(html: html, encoding: NSUTF8StringEncoding) {
var bodyNode = doc.body
if let inputNodes = bodyNode?.xpath("//a/#href[ends-with(.,'.txt')]") {
for node in inputNodes {
println(node.contents)
}
}
}
Fuzi (Xpath and CSS Selectors):
let html = "<html><head></head><body><ul><li><input type='image' name='input1' value='string1value' class='abc' /></li><li><input type='image' name='input2' value='string2value' class='def' /></li></ul><span class='spantext'><b>Hello World 1</b></span><span class='spantext'><b>Hello World 2</b></span><a href='example.com'>example(English)</a><a href='example.co.jp'>example(JP)</a></body>"
do {
// if encoding is omitted, it defaults to NSUTF8StringEncoding
let doc = try HTMLDocument(string: html, encoding: NSUTF8StringEncoding)
// XPath queries
for anchor in doc.xpath("//a/#href[ends-with(.,'.txt')]") {
print(anchor.stringValue)
}
} catch let error {
print(error)
}
The ends-with function is part of Xpath 2.0.
SwiftSoup (CSS Selectors):
do{
let doc: Document = try SwiftSoup.parse("...")
let links: Elements = try doc.select("a[href]") // a with href
let pngs: Elements = try doc.select("img[src$=.png]")
// img with src ending .png
let masthead: Element? = try doc.select("div.masthead").first()
// div with class=masthead
let resultLinks: Elements? = try doc.select("h3.r > a") // direct a after h3
} catch Exception.Error(let type, let message){
print(message)
} catch {
print("error")
}
Ji (XPath):
let jiDoc = Ji(htmlURL: URL(string: "http://www.apple.com/support")!)
let titleNode = jiDoc?.xPath("//head/title")?.first
print("title: \(titleNode?.content)") // title: Optional("Official Apple Support")

Try SwiftSoup, a port of jsoup to Swift.
let html: String = "<a id=1 href='?foo=bar&mid&lt=true'>One</a> <a id=2 href='?foo=bar<qux&lg=1'>Two</a>";
let els: Elements = try SwiftSoup.parse(html).select("a");
for element: Element in els.array(){
print(try element.attr("href"))
}

You could try this swift-html-parser:
https://github.com/tid-kijyun/Swift-HTML-Parser
It helps a lot.
And for getting your html from a txt you can:
let file = "file.txt"
if let dirs : [String] = NSSearchPathForDirectoriesInDomains(NSSearchPathDirectory.DocumentDirectory, NSSearchPathDomainMask.AllDomainsMask, true) as? [String] {
let dir = dirs[0] //documents directory
let path = dir.stringByAppendingPathComponent(file);
let html = String(contentsOfFile: path, encoding: NSUTF8StringEncoding, error: nil)
Edit:
To get what you need you could use as the exemple:
import Foundation
let html = "theHtmlYouWannaParse"
var err : NSError?
var parser = HTMLParser(html: html, error: &err)
if err != nil {
println(err)
exit(1)
}
var bodyNode = parser.body
if let inputNodes = bodyNode?.findChildTags("b") {
for node in inputNodes {
println(node.contents)
}
}
if let inputNodes = bodyNode?.findChildTags("a") {
for node in inputNodes {
println(node.getAttributeNamed("href")) //<- Here you would get your files link
}
}

Related

How to get imageURL out of HTML tag

I need to strip out the HTML tag from the URL which i m getting in the response from server.
I have tried following code but it's not helping me. Any Ideas??
The String i m geting
"test Discussion for images"
I have added the extension to retrieve the HTTP string but i m not getting URL accurate.
extension String {
func removeHTMLTag() -> String {
return self.replacingOccurrences(of: "<[^>]+>", with: "", options: String.CompareOptions.regularExpression, range: nil)
}
}
Using SwiftSoup you can get imageURL
let doc = try SwiftSoup.parse("<div id=div2><p>How are you?</p><div id=div3><img src=http://example.com/test.png></div></div>");
for element in try doc.select("img").array(){
try print(element.attr("src"))
}
**Output**
//http://example.com/test.png
You can try this.
func removeHTMLTags(for regex: String!, in text: String!) -> [String] {
do {
let regex = try NSRegularExpression(pattern: regex, options: [])
let nsString = text as NSString
let results = regex.matches(in: text, range: NSMakeRange(0, nsString.length))
return results.map { nsString.substring(with: $0.range)}
} catch let error as NSError {
print("invalid regex: \(error.localizedDescription)")
return []
}
}
Call this function -
let text = "test Discussion for images<div><img src=\"http:\\\\45.35.4.250\\MvestUploadContainer\\7e3ba5ba-000b-4dc4-ae6e-a4bc8d59088c.png\"><br></div>"
let url = removeHTMLTags(for: "(http[^\\s]+(jpg|jpeg|png)\\b)", in: String(text))
print(url[0])
Output - http:\\45.35.4.250\MvestUploadContainer\7e3ba5ba-000b-4dc4-ae6e-a4bc8d59088c.png

Extract JSON string from html only using iOS API

I want to extract JSON string from html document "without" using third party Framework.
I'm trying to create iOS framework and I do not want to use third party Framework in it.
Example url:
http://www.nicovideo.jp/watch/sm33786214
In that html, there is a line:
I need to extract:
JSON_String_I_want_to extract
and convert it to JSON object.
With third party framework "Kanna", it is like this:
if let doc = Kanna.HTML(html: html, encoding: String.Encoding.utf8) {
if let descNode = doc.css("#js-initial-watch-data[data-api-data]").first {
let dataApiData = descNode["data-api-data"]
if let data = dataApiData?.data(using: .utf8) {
if let json = try? JSON(data: data, options: JSONSerialization.ReadingOptions.mutableContainers) {
I searched the web with similar question but unable to apply to my case:(I need to admit I'm not quite following regular expression)
if let html = String(data:data, encoding:.utf8) {
let pattern = "data-api-data=\"(.*?)\".*?>"
let regex = try! NSRegularExpression(pattern: pattern, options: .caseInsensitive)
let matches = regex.matches(in: html, options: [], range: NSMakeRange(0, html.count))
var results: [String] = []
matches.forEach { (match) -> () in
results.append( (html as NSString).substring(with: match.rangeAt(1)) )
}
if let stringJSON = results.first {
let d = stringJSON.data(using: String.Encoding.utf8)
if let json = try? JSONSerialization.jsonObject(with: d!, options: []) as? Any {
// it does not get here...
}
Anyone expert in extracting from html and convert it to JSON?
Thank you.
Your pattern does not seem to be bad, just that attribute values of HTML Elements may be using character entities.
You need to replace them into actual characters before parsing the String as JSON.
if let html = String(data:data, encoding: .utf8) {
let pattern = "data-api-data=\"([^\"]*)\""
let regex = try! NSRegularExpression(pattern: pattern, options: .caseInsensitive)
let matches = regex.matches(in: html, range: NSRange(0..<html.utf16.count)) //<-USE html.utf16.count, NOT html.count
var results: [String] = []
matches.forEach {match in
let propValue = html[Range(match.range(at: 1), in: html)!]
//### You need to replace character entities into actual characters
.replacingOccurrences(of: """, with: "\"")
.replacingOccurrences(of: "&apos;", with: "'")
.replacingOccurrences(of: ">", with: ">")
.replacingOccurrences(of: "<", with: "<")
.replacingOccurrences(of: "&", with: "&")
results.append(propValue)
}
if let stringJSON = results.first {
let dataJSON = stringJSON.data(using: .utf8)!
do {
let json = try JSONSerialization.jsonObject(with: dataJSON)
print(json)
} catch {
print(error) //You should not ignore errors silently...
}
} else {
print("NO result")
}
}

Swift: how to display image from html source

How can I display image from html source in some of sites in swift 2.2?
Actually I don't have any JSON or XML.
The important thing is that I have to use regex.
I tried this:
if htmlContent != nil
{
let htmlContent = (item?.htmlContent)! as NSString
var imageSource = ""
let rangeOfString = NSMakeRange(0, htmlContent.length)
let regex = try! NSRegularExpression(pattern: "(<img.*?src=\")(.*?)(\".*?>)", options: [.CaseInsensitive])
if htmlContent.length > 0
{
let match = regex.firstMatchInString(htmlContent as String, options: [.WithTransparentBounds], range: rangeOfString)
if match != nil
{
let imageURL = htmlContent.substringWithRange(match!.rangeAtIndex(2)) as NSString
print(imageURL)
if NSString(string: imageURL.lowercaseString).rangeOfString("feedBurner").location == NSNotFound
{
imageSource = imageURL as String
}
}
}
if imageSource != ""
{
imgHTMLLoader.setImageWithURL(NSURL(fileURLWithPath: imageSource), placeholderImage: UIImage(named: "placeholder"))
print("placeholderImage is not! nil")
}
else
{
imgHTMLLoader.image = UIImage(named: "placeholder")
print("placeholderImage is nil")
}
}
in this sample(library)... htmlContent always is nil.
this sample , use "Helper library" but it dosn't work...
thanks
Using SwiftSoup third party library and Swift 3.0
let doc = try SwiftSoup.parse("<div id=div1><p>Hello</p><p>Another <b>element</b></p><div id=div2><img src=foo.png></div></div>");
for element in try doc.select("img").array(){
try print(element.attr("src"))
}
//foo.png

how to load local html file to string variable in ios swift?

i want to load my local html file into a string variable. i don't know how to do it. please help me.
i found below link but it load it from online url.
Swift & UIWebView element to hide
Copy the html into your project directory and use this code :
#IBOutlet weak var webView: UIWebView!
override func viewDidLoad() {
super.viewDidLoad()
var htmlFile = NSBundle.mainBundle().pathForResource("MyHtmlFile", ofType: "html")
var htmlString = try? String(contentsOfFile: htmlFile!, encoding: NSUTF8StringEncoding)
webView.loadHTMLString(htmlString!, baseURL: nil)
}
In Swift 3:
let htmlFile = Bundle.main.path(forResource:"MyHtmlFile", ofType: "html")
let htmlString = try? String(contentsOfFile: htmlFile!, encoding: String.Encoding.utf8)
webView.loadHTMLString(htmlString!, baseURL: nil)
I recommend using optional chaining instead of force unwrapping htmlString as above.
You can write some utility method like this and get the html string and load it to webview. I used the URL approach here
private func getHTML() -> String {
var html = ""
if let htmlPathURL = Bundle.main.url(forResource: "test", withExtension: "html"){
do {
html = try String(contentsOf: htmlPathURL, encoding: .utf8)
} catch {
print("Unable to get the file.")
}
}
return html
}
Swift 3 syntax:
guard
let file = Bundle.main.path(forResource: "agreement", ofType: "html"),
let html = try? String(contentsOfFile: file, encoding: String.Encoding.utf8)
else {
return
}
webView.loadHTMLString(html, baseURL: nil)
Simple answer
let indexPath = Bundle.main.path(forResource: "index", ofType: "html", inDirectory: "/")
if let indexPath = indexPath
{
do
{
let htmlContent = try String(contentsOfFile: indexPath, encoding: String.Encoding.utf8)
let base = Bundle.main.resourceURL
self.webView.loadHTMLString(htmlContent, baseURL: base)
}
catch let err as NSError
{
print(err.debugDescription)
}
}

A swift wrapper around libxml for parsing HTML

I'm getting an odd error message saying "Extra argument 'endocing' in call", but it's in the method, so it's not an extra argument? Why is this happening and how can I resolve this? The error message appears when declaring the variable "parser" as you can see. Thanks!
if let checkedUrl = NSURL(string:"http://www.mobladet.se") {
if let htmlString = String(contentsOfURL: checkedUrl, encoding: NSUTF8StringEncoding, error: nil) {
// Parsing HTML
let opt = CInt(HTML_PARSE_NOERROR.value | HTML_PARSE_RECOVER.value)
var err : NSError?
var parser = HTMLParser(html: htmlString, encoding: NSUTF8StringEncoding, option: opt, error: &err)
var bodyNode = parser.body
// Create an array of the part of HTML you need
if let inputNodes = bodyNode?.findChildTags("h4") {
for node in inputNodes {
let result = html2String(node.rawContents)
println(result)
}
}
} else {
println("Could not load HTML Content")
}
}
html should be HTML code to be parsed not a NSURL. You need to use String( contentsOfURL:) to extract its contents and them parse it
if let checkedUrl = NSURL(string:"http://stackoverflow.com/questions/28751228/a-swift-wrapper-around-libxml-for-parsing-html"){
if let htmlString = String(contentsOfURL: checkedUrl, encoding: NSUTF8StringEncoding, error: nil) {
println(htmlString)
} else {
println("could not load html string from the url")
}
} else {
println("invalid url")
}