Htmlkit swift query between CSS Parse elements [duplicate] - html

Parse between elements
eg
<span>7:33</span>AM </dd>\n
<dt>Dinner</dt>\n <dd id=\"Dinner\">\n <span>12:23</span>PM </dd>\n <dt>Lunch</dt>\n <dd id=\"Lunch\">\n <span>2:43</span>PM </dd>\n
how do I get "AM/PM" Values
let test: [String] = document.querySelectorAll("span").compactMap({element in
guard let span = document.querySelector("dt") else {
return nil
}
return span.elementId
})
This is just looping over 7:33 nine times :(

you can solve this the same way you would do it on any other browser. The problem is not HTMLKit specific.
Since there is no way to select a HTML Text Node via CSS, you have to select its parent and then access the text via the textContent property or access the parent node's child nodes.
So here are some options to solve your problem, using HTMLKit as an example and the following sample DOM:
let html = """
<html>
<body>
<dl>
<dt>Breakfast</dt>
<dd id="Breakfast"><span>10:00</span>AM</dd>
<dt>Dinner</dt>
<dd id="Dinner"><span>12:23</span>PM</dd>
</dl>
</body>
</html>
"""
let doc = HTMLDocument(string: html)
let elements = doc.querySelectorAll("dd")
Option 1: Select the dd elements and access the textContent
elements.forEach { ddElement in
print(ddElement.textContent)
}
// Would produce:
// 10:00AM
// 12:23PM
Option 2: Select the dd elements and iterate through their child nodes, while filtering out everything except for HTMLText nodes. Additionally you can provide your own custom filter:
// Walk every text node underneath each <dd> element and print its content.
elements.forEach { ddElement in
    // showOptions: [.text] limits the iterator to text nodes; no additional filter is applied.
    let iter: HTMLNodeIterator = ddElement.nodeIterator(showOptions: [.text], filter: nil)
    iter.forEach { node in
        // Conditional cast instead of `as!`: an unexpected node type is skipped rather than crashing.
        guard let textNode = node as? HTMLText else { return }
        print(textNode.textContent)
    }
}
// Would produce:
// 10:00
// AM
// 12:23
// PM
Option 3: Expanding on the previous option, you can provide a custom filter for the node iterator:
// Walk the text nodes underneath each <dd> element, keeping only those that mention AM or PM.
for dd in elements {
    let iter: HTMLNodeIterator = dd.nodeIterator(showOptions: [.text]) { node in
        // Reject any text node that contains neither "AM" nor "PM".
        let text = node.textContent
        guard text.contains("AM") || text.contains("PM") else {
            return .reject
        }
        return .accept
    }
    iter.forEach { node in
        // Conditional cast instead of `as!`: an unexpected node type is skipped rather than crashing.
        guard let textNode = node as? HTMLText else { return }
        print(textNode.textContent)
    }
}
// Would produce:
// AM
// PM
Option 4: Wrap the AM and PM in their own <span> elements and access those, e.g. with dd > span selector:
doc.querySelectorAll("dd > span").forEach { elem in
print(elem.textContent)
}
// Given the sample DOM would produce:
// 10:00
// 12:23
// if you wrap the am/pm in spans then you would also get those in the output
Your snippet produces: ["", ""] with the sample DOM from above. Here is why:
let test: [String] = doc.querySelectorAll("span")
.compactMap { element in // element is a <span> HTMLElement
// However the elements returned here are <dt> elements and not <span>
guard let span = doc.querySelector("dt") else {
return nil
}
// The <dt> elements in the DOM do not have IDs, hence an empty string is returned
return span.elementId
}
I hope this helps and clarifies some things.

Related

How to replace words in Google doc while maintaing text style?

I want to scan words in a Google doc from left to right and replace the first occurrences of some keywords with a URL or a bbcode like tag wrapper around them.
I cannot use findText API because it's not simple regex finding but complex pattern matching involving lots of if else conditions involving business logic.
Here is how I want to solve this
let document = DocumentApp.getActiveDocument().getBody();
let paragraph = document.getParagraphs()[0];
let contents = paragraph.getText();
// makeAllTheNecessaryReplacemens has all the business logic to identify which keywords need to changed
let newContents = makeAllTheNecessaryReplacemens(contents);
paragraph.setText(newContents);
The problem here is that text style gets wiped out and also makeAllTheNecessaryReplacemens cannot add hyperlinks to string text.
Please suggest a way to do this.
Proposed function
/**
* Dispatch table from attribute name to the setter that applies that
* single attribute to a character range.
*
* Every entry takes the same four arguments:
*   o - the element the setter is called on
*   s - start offset of the range
*   e - end offset of the range
*   a - the attribute value to apply
*
* Having one setter per attribute lets a caller walk a complete
* attributes object obtained from another element and apply the
* entries one at a time, skipping the ones it does not want to set.
*/
const attributeKey = {
FONT_SIZE : (o,s,e,a) => o.setFontSize(s,e,a),
STRIKETHROUGH : (o,s,e,a) => o.setStrikethrough(s,e,a),
FOREGROUND_COLOR : (o,s,e,a) => o.setForegroundColor(s,e,a),
LINK_URL : (o,s,e,a) => o.setLinkUrl(s,e,a),
UNDERLINE : (o,s,e,a) => o.setUnderline(s,e,a),
BOLD : (o,s,e,a) => o.setBold(s,e,a),
ITALIC : (o,s,e,a) => o.setItalic(s,e,a),
BACKGROUND_COLOR : (o,s,e,a) => o.setBackgroundColor(s,e,a),
FONT_FAMILY : (o,s,e,a) => o.setFontFamily(s,e,a)
}
/**
 * Replace every match of textToReplace in the active document's body with
 * replacementText, re-applying the formatting and hyperlink found at the
 * first character of each match to the inserted text.
 *
 * @param {string} textToReplace - Search pattern, passed unchanged to body.findText().
 * @param {string} replacementText - Text inserted in place of each match.
 * @returns {null|undefined} null when the search position could not be rebuilt
 *     after a replacement (the search stops there); otherwise undefined.
 */
function replaceTextPlus(textToReplace, replacementText) {
  // Initializing
  let body = DocumentApp.getActiveDocument().getBody();
  let searchResult = body.findText(textToReplace);

  while (searchResult != null) {
    // Getting info about result
    let foundElement = searchResult.getElement();
    let start = searchResult.getStartOffset();
    let end = searchResult.getEndOffsetInclusive();

    // This returns a complete attributes object.
    // Many attributes have null as a value.
    let attributes = foundElement.getAttributes(start);

    // Replacing text
    foundElement.deleteText(start, end);
    foundElement.insertText(start, replacementText);

    // Inclusive end index of the newly inserted text
    let newEnd = start + replacementText.length - 1;

    // Re-apply the captured attributes one by one, skipping null values.
    // Attribute names without a setter in attributeKey are skipped as well,
    // instead of failing with "attributeKey[a] is not a function".
    for (let a in attributes) {
      const applyAttribute = attributeKey[a];
      if (attributes[a] != null && typeof applyAttribute === "function") {
        applyAttribute(foundElement, start, newEnd, attributes[a]);
      }
    }

    // Rebuild the search result so that it covers the NEW text; the next
    // findText call then starts after the replacement rather than inside it.
    try {
      let rangeBuilder = DocumentApp.getActiveDocument().newRange();
      rangeBuilder.addElement(foundElement, start, newEnd);
      searchResult = rangeBuilder.getRangeElements()[0];
    } catch (e) {
      Logger.log("End of Document");
      return null;
    }

    // Searches for the next result
    searchResult = body.findText(textToReplace, searchResult);
  }
}
Extending the findText API
This function relies on the findText API, but it adds in a few more steps.
Find the text.
Get the element containing the text.
Get the start and end indices of the text.
Get the attributes of the text (font, color, hyperlink etc)
Replace the text.
Update the end index.
Use the old attributes to update the new text.
You call it like this:
replaceTextPlus("Bing", "Google")
replaceTextPlus("occurrences", "happenings")
replaceTextPlus("text", "prefixedtext")
How to set the formatting and link attributes.
This relies on the attributes object that gets returned from getAttributes. Which looks something like this:
{
FOREGROUND_COLOR=#ff0000,
LINK_URL=null,
FONT_SIZE=null,
ITALIC=true,
STRIKETHROUGH=null,
FONT_FAMILY=null,
BOLD=null,
UNDERLINE=true,
BACKGROUND_COLOR=null
}
I tried to use setAttributes but it was very unreliable. Using this method almost always resulted in some formatting loss.
To fix this I make an object attributeKey that wraps all the different functions for setting individual attributes, so that they can be called from this loop:
for (let a in attributes) {
if (attributes[a] != null) {
attributeKey[a](foundElement, start, newEnd, attributes[a]);
}
}
This allows null values to be skipped which seems to have solved the unreliability problem. Perhaps the update buffer gets confused with many values.
Limitations
This function uses the formatting of the first character of the found word. If the same word has mixed formatting within itself, for example "Hello" with normal, bold and italic letters, the replacement word will have the formatting of the first letter only. This could potentially be fixed by identifying the word and iterating over every single letter.
References
Text class
Body class
DocumentApp
Element Interface
Attribute Enum

Creating an html list rendering from indented text tree

I'm trying to create an html page out of indented text.
For example:
text file:
1. hello
- stack
- overflow
- how
- are you
Will come out as:
<ol>
<li>hello</li>
<ul>
<li>stack</li> ...
so it will render as an indented list.
I thought it would be best to create a node tree inspired by this answer for a similar problem in Python
Here's my cloned struct from the link in Go which doesn't work as intended, it gets stuck in the recursion for some reason:
// addChildren attaches the leading nodes of the list to n as its subtree.
// A node indented deeper than the current child level becomes a descendant of
// the most recently added child. Processing stops at the first node whose
// textStart is not greater than n.textStart. An empty list is a no-op.
func (n *node) addChildren(nodes []node) {
	rest := n.attachChildren(nodes)
	// A node indented less than the first child but still deeper than n starts
	// a new child level instead of being dropped.
	for len(rest) > 0 && rest[0].textStart > n.textStart {
		rest = rest[0:]
		rest = n.attachChildren(rest)
	}
}

// attachChildren consumes the run of nodes that belong under n and returns the
// unconsumed tail. Returning the tail is what keeps caller and callee in sync:
// a slice argument is passed by value, so without it the caller would process
// again every node the recursive call had already handled.
func (n *node) attachChildren(nodes []node) []node {
	if len(nodes) == 0 {
		return nodes
	}
	// The first node defines the indentation level of n's direct children.
	childLevel := nodes[0].textStart
	for len(nodes) > 0 {
		next := nodes[0]
		switch {
		case next.textStart == childLevel:
			// Direct child: attach it and move on.
			n.children = append(n.children, next)
			nodes = nodes[1:]
		case next.textStart > childLevel:
			// Deeper indent: hand the list to the last child and continue
			// with whatever it leaves behind.
			last := &n.children[len(n.children)-1]
			nodes = last.attachChildren(nodes)
		default:
			// Shallower indent: this node belongs to an ancestor.
			return nodes
		}
	}
	return nodes
}
I have found Markdown
As an optimal tool for the task!

How to create a query selector out of a selected element?

In Google Chrome, especially now with custom elements, it has become very cumbersome to select an element by hand, even though the browser already knows the whole path to it. Is there a way to get a query selector for the element that I'm inspecting?
Situation:
What chrome can tell me:
What chrome is unable to create for me AFAIK:
While building a Chrome extension I found a need to uniquely locate an element when returning to a page. To do this I needed to create a query string for a selected element (custom context menu click).
While searching for a solution I found this unanswered question.
As I could not find an off the shelf solution or API to do the task I wrote the following function. It is untested in the wild, is very rough and ready (using poor node traversing techniques). I posted it in this state lest I forget and this question remains unanswered.
Create Query String For Element
A function to build a query string that will uniquely locate an element from a reference of the element.
// Build the selector, then confirm that it resolves back to the same element.
const queryStr = createQueryStringForElement(myElement); // returns a string or undefined
if (queryStr) {
    const element = document.querySelector(queryStr);
    console.log(element === myElement); // expected result true
}
If the function fails to create a query that uniquely locates the element, it returns undefined; otherwise it returns the query string.
Example results
"#editor > div.ace_scroller > div.ace_content > div.ace_layer.ace_text-layer > div.ace_line:nth-child(45) > span.ace_punctuation.ace_operator"
"#buttons" // A UI container
"#buttons > div.buttons" // A sub UI container
"#buttons > div.buttons:nth-child(2)" // A button element by position
"#buttons > div.buttons:nth-child(3)" // A button element by position
How it works
The code assumes that the page is well formed (ids must be unique).
The query string will try to start with an id eg "#elementId" but if an element has no id the query will use the tag and class names. eg "div.my-class".
The tag and class name may not uniquely identify the element. To check if the query is unique, the query string is used to query the DOM from the elements parent.
If needed, the query string will use the element's position to refine the query, e.g. "div.my-class:nth-child(2)". Unfortunately this makes the resultant query string sensitive to changes in element order.
The query string is built up along each parent until it finds an element with an id or there are no more parents.
The final step uses the query to see if the query finds the correct element returning the query if successful.
The code
/**
 * Build a CSS selector that locates `element` via document.querySelector.
 *
 * The path is assembled from the element upwards, one segment per ancestor,
 * and stops at the first element that has an id (ids are assumed unique).
 *
 * @param {Element} element - The element to build a selector for.
 * @returns {string|undefined} The selector, or undefined when the resulting
 *     query does not resolve back to `element` (or is not a valid selector).
 */
function createQueryStringForElement(element) {
    // Selector segment for a single element: "#id", or "tag.class1.class2",
    // optionally refined with ":nth-child(n)".
    const getElementSel = element => {
        if (element.id) { return "#" + element.id; }
        // Note: the original assigned the tag name through an undeclared `sel`
        // variable, which leaked a global (and throws in strict mode).
        let str = element.tagName.toLowerCase();
        if (element.classList.length) {
            str += "." + [...element.classList.values()].join(".");
        }
        const parent = element.parentElement;
        // If tag + classes do not single out the element under its parent,
        // disambiguate by its position among the parent's children.
        if (parent && parent.querySelector(str) !== element) {
            const index = Array.prototype.indexOf.call(parent.children, element);
            if (index > 0) { str += ":nth-child(" + (index + 1) + ")"; }
        }
        return str;
    };

    const queryPath = [];
    const original = element;
    do {
        const subQuery = getElementSel(element);
        queryPath.push(subQuery);
        // An id anchors the path; no need to climb any further.
        if (subQuery[0] === "#") { break; }
        element = element.parentElement;
    } while (element);

    const query = queryPath.reverse().join(" > ");
    try {
        // Only hand the query back when it really finds the original element.
        const found = document.querySelector(query);
        if (found === original) { return query; }
    } catch (e) { /* invalid selector: fall through and return undefined */ }
}

How can I match text that precedes an `<a>` tag then return the `<a>` node?

I have the following, where I am trying to only capture the second case, where the text matches But I want this one here. Currently, it captures both cases.
package main
import (
"bytes"
"fmt"
"io"
"strings"
"golang.org/x/net/html"
)
// getTag walks the tree rooted at doc depth-first and collects every element
// node whose tag name equals tag. The walk does not descend into a matching
// element, so matches nested inside another match are not reported.
func getTag(doc *html.Node, tag string) []*html.Node {
	var found []*html.Node
	var visit func(*html.Node)
	visit = func(current *html.Node) {
		if current.Type != html.ElementNode || current.Data != tag {
			// Not a match: keep looking among the children.
			for c := current.FirstChild; c != nil; c = c.NextSibling {
				visit(c)
			}
			return
		}
		found = append(found, current)
	}
	visit(doc)
	return found
}
// main parses testHTML, collects every <a> element and prints the rendered
// elements one per line.
func main() {
	doc, err := html.Parse(strings.NewReader(testHTML))
	if err != nil {
		// Report the failure instead of continuing with an unusable document.
		fmt.Println("parse error:", err)
		return
	}
	nodes := getTag(doc, "a")
	var buf bytes.Buffer
	w := io.Writer(&buf)
	for i, node := range nodes {
		if err := html.Render(w, node); err != nil {
			fmt.Println("render error:", err)
			return
		}
		// Newline between rendered nodes, but not after the last one.
		if i < (len(nodes) - 1) {
			w.Write([]byte("\n"))
		}
	}
	fmt.Println(buf.String())
}
var testHTML = `<html><body>
I do not want this link here link text
But I want this one here more link text
</body></html>`
This outputs:
link text
more link text
I would like to match specific text that precedes an <a> tag and, if it matches, return the <a> node. For instance, pass in But I want this one here and it returns more link text. I've been told to not parse html with regex but now I am stuck.
You're actually pretty close, because you are already using a proper parser (html.Parse from golang.org/x/net/html).
The trick here is that the various elements of the page are bound together conveniently, so you can use your existing crawler code with a later filtering function, if you like. (You could instead combine the filtering directly into the crawler.)
Each n *html.ElementNode is preceded by something unless it's the initial element in a block (first of a document or first child node), and that something is in n.PrevSibling. If its type is html.TextNode you have a sequence of the form:
some text<a ...>thing</a>
and you can examine the "some text" in the previous node:
// wanted reports whether n is immediately preceded by a text node whose
// content matches re. It returns false when n has no previous sibling or when
// that sibling is not a text node.
func wanted(re *regexp.Regexp, n *html.Node) bool {
	prev := n.PrevSibling
	if prev != nil && prev.Type == html.TextNode {
		return re.MatchString(prev.Data)
	}
	return false
}
This won't be perfect, because you could have, e.g.:
text <font></font> broken <font></font>uplast link
and the code will try to match against the string up, when you probably should put the text together into text broken up and pass that to the matcher. See more complete example here.

How to extract text outside of a html tag using jsoup?

I have the following HTML code:
Data
<div class="alg"></div>
<div class="alg"></div>
Pepsi
791
<div class="alg"></div>
<div class="alg"></div>
Coke
700
<div class="gap"></div>
<div class="gap"></div>
I want to extract all values Coke,700,pepsi,791. I tried the following code using Jsoup:
Document doc = Jsoup.parse(html);
for( Element element : doc.select("div.alg") ) // Select all the div tags
{
TextNode next = (TextNode) element.nextSibling(); // Get the next node of each div as a TextNode
System.out.println(next.text()); // Print the text of the TextNode
}
But the above code always prints an empty string ("").
Try this:
Document doc = Jsoup.parse(url, 30000);
for (Element element : doc.select(".gap")) { // Every element with class "gap"
    Node next = element.nextSibling();
    // StringBuilder rather than StringBuffer: the accumulator is local to
    // this loop iteration, so synchronized appends buy nothing.
    StringBuilder sb = new StringBuilder();
    // Concatenate the run of consecutive text nodes that follows the element.
    while (next instanceof TextNode) {
        sb.append(((TextNode) next).text());
        next = next.nextSibling();
    }
    System.out.println(sb.toString()); // Print the collected text
}