Skip to content

Commit 4bbb637

Browse files
authored
Merge pull request #72 from mischov/feature/refactor-extractors
Refactor extractors
2 parents b4b68dd + c6fa8b8 commit 4bbb637

File tree

21 files changed

+905
-375
lines changed

21 files changed

+905
-375
lines changed

lib/meeseeks.ex

Lines changed: 48 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -531,13 +531,24 @@ defmodule Meeseeks do
531531
Returns the combined data of a result or the result's children, which may
532532
be an empty string.
533533
534+
By default, once the data has been combined, the whitespace is compacted by replacing
535+
all instances of more than one whitespace character with a single space
536+
and then trimmed.
537+
534538
Data is the content of `<script>` or `<style>` tags, or the content of
535539
comments starting with "[CDATA[" and ending with "]]". The latter behavior
536540
is to support the extraction of CDATA from HTML, since HTML5 parsers parse
537541
CDATA as comments.
538542
539543
Nil input returns `nil`.
540544
545+
## Options
546+
547+
* `:collapse_whitespace` - Boolean determining whether or not to replace
548+
blocks of whitespace with a single space character. Defaults to `true`.
549+
* `:trim` - Boolean determining whether or not to trim the resulting
550+
text. Defaults to `true`.
551+
541552
## Examples
542553
543554
iex> import Meeseeks.CSS
@@ -550,11 +561,11 @@ defmodule Meeseeks do
550561
iex> Meeseeks.data(result2)
551562
"Hi"
552563
"""
553-
@spec data(extractable) :: String.t() | nil
554-
def data(extractable)
555-
def data(nil), do: nil
556-
def data(%Result{} = result), do: Result.data(result)
557-
def data(x), do: raise_cannot_extract(x, "data/1")
564+
@spec data(extractable, Keyword.t()) :: String.t() | nil
565+
def data(extractable, opts \\ [])
566+
def data(nil, _), do: nil
567+
def data(%Result{} = result, opts), do: Result.data(result, opts)
568+
def data(x, _), do: raise_cannot_extract(x, "data/1")
558569

559570
@doc """
560571
Returns a map of a result's data attributes, or nil if the result
@@ -609,8 +620,19 @@ defmodule Meeseeks do
609620
Returns the combined text of a result or the result's children, which may
610621
be an empty string.
611622
623+
By default, once the text has been combined, the whitespace is compacted by replacing
624+
all instances of more than one whitespace character with a single space
625+
and then trimmed.
626+
612627
Nil input returns `nil`.
613628
629+
## Options
630+
631+
* `:collapse_whitespace` - Boolean determining whether or not to replace
632+
blocks of whitespace with a single space character. Defaults to `true`.
633+
* `:trim` - Boolean determining whether or not to trim the resulting
634+
text. Defaults to `true`.
635+
614636
## Examples
615637
616638
iex> import Meeseeks.CSS
@@ -619,11 +641,11 @@ defmodule Meeseeks do
619641
iex> Meeseeks.own_text(result)
620642
"Hello,"
621643
"""
622-
@spec own_text(extractable) :: String.t() | nil
623-
def own_text(extractable)
624-
def own_text(nil), do: nil
625-
def own_text(%Result{} = result), do: Result.own_text(result)
626-
def own_text(x), do: raise_cannot_extract(x, "own_text/1")
644+
@spec own_text(extractable, Keyword.t()) :: String.t() | nil
645+
def own_text(extractable, opts \\ [])
646+
def own_text(nil, _), do: nil
647+
def own_text(%Result{} = result, opts), do: Result.own_text(result, opts)
648+
def own_text(x, _), do: raise_cannot_extract(x, "own_text/1")
627649

628650
@doc """
629651
Returns a result's tag, or `nil` if the result represents a node without a
@@ -649,8 +671,19 @@ defmodule Meeseeks do
649671
Returns the combined text of a result or the result's descendants, which
650672
may be an empty string.
651673
674+
By default, once the text has been combined, the whitespace is compacted by replacing
675+
all instances of more than one whitespace character with a single space
676+
and then trimmed.
677+
652678
Nil input returns `nil`.
653679
680+
## Options
681+
682+
* `:collapse_whitespace` - Boolean determining whether or not to replace
683+
blocks of whitespace with a single space character. Defaults to `true`.
684+
* `:trim` - Boolean determining whether or not to trim the resulting
685+
text. Defaults to `true`.
686+
654687
## Examples
655688
656689
iex> import Meeseeks.CSS
@@ -659,11 +692,11 @@ defmodule Meeseeks do
659692
iex> Meeseeks.text(result)
660693
"Hello, World!"
661694
"""
662-
@spec text(extractable) :: String.t() | nil
663-
def text(extractable)
664-
def text(nil), do: nil
665-
def text(%Result{} = result), do: Result.text(result)
666-
def text(x), do: raise_cannot_extract(x, "text/1")
695+
@spec text(extractable, Keyword.t()) :: String.t() | nil
696+
def text(extractable, opts \\ [])
697+
def text(nil, _), do: nil
698+
def text(%Result{} = result, opts), do: Result.text(result, opts)
699+
def text(x, _), do: raise_cannot_extract(x, "text/1")
667700

668701
@doc """
669702
Returns the `Meeseeks.TupleTree` of a document or result and its

lib/meeseeks/document.ex

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,7 @@ defmodule Meeseeks.Document do
5757
```
5858
"""
5959

60-
alias Meeseeks.{Document, Error}
60+
alias Meeseeks.{Document, Error, Extractor}
6161
alias Meeseeks.Document.{Element, Node}
6262

6363
defstruct id_counter: nil, roots: [], nodes: %{}
@@ -76,9 +76,9 @@ defmodule Meeseeks.Document do
7676
def html(%Document{} = document) do
7777
document
7878
|> get_root_nodes()
79-
|> Enum.reduce("", fn root_node, acc ->
80-
acc <> Node.html(root_node, document)
81-
end)
79+
|> Enum.map(&Extractor.Html.from_node(&1, document))
80+
|> IO.iodata_to_binary()
81+
|> String.trim()
8282
end
8383

8484
@doc """
@@ -87,7 +87,7 @@ defmodule Meeseeks.Document do
8787
def tree(%Document{} = document) do
8888
document
8989
|> get_root_nodes()
90-
|> Enum.map(&Node.tree(&1, document))
90+
|> Enum.map(&Extractor.Tree.from_node(&1, document))
9191
end
9292

9393
# Query

lib/meeseeks/document/comment.ex

Lines changed: 0 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,6 @@
11
defmodule Meeseeks.Document.Comment do
2-
use Meeseeks.Document.Node
32
@moduledoc false
43

54
@enforce_keys [:id]
65
defstruct parent: nil, id: nil, content: ""
7-
8-
@impl true
9-
def html(node, _document) do
10-
"<!--#{node.content}-->"
11-
end
12-
13-
@impl true
14-
def tree(node, _document) do
15-
{:comment, node.content}
16-
end
176
end

lib/meeseeks/document/data.ex

Lines changed: 0 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -1,29 +1,6 @@
11
defmodule Meeseeks.Document.Data do
2-
use Meeseeks.Document.Node
32
@moduledoc false
43

5-
alias Meeseeks.Document.{Data, Helpers}
6-
74
@enforce_keys [:id]
85
defstruct parent: nil, id: nil, type: nil, content: ""
9-
10-
@impl true
11-
def data(node, _document) do
12-
Helpers.collapse_whitespace(node.content)
13-
end
14-
15-
@impl true
16-
def html(%Data{type: :cdata, content: content}, _document) do
17-
"<![CDATA[#{content}]]>"
18-
end
19-
20-
@impl true
21-
def html(node, _document) do
22-
node.content
23-
end
24-
25-
@impl true
26-
def tree(node, _document) do
27-
node.content
28-
end
296
end

lib/meeseeks/document/doctype.ex

Lines changed: 0 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,22 +1,6 @@
11
defmodule Meeseeks.Document.Doctype do
2-
use Meeseeks.Document.Node
32
@moduledoc false
43

54
@enforce_keys [:id]
65
defstruct parent: nil, id: nil, name: "", public: "", system: ""
7-
8-
@impl true
9-
def html(node, _document) do
10-
"<!DOCTYPE #{node.name}#{format_legacy(node.public, node.system)}>"
11-
end
12-
13-
@impl true
14-
def tree(node, _document) do
15-
{:doctype, node.name, node.public, node.system}
16-
end
17-
18-
defp format_legacy("", ""), do: ""
19-
defp format_legacy(public, ""), do: " PUBLIC \"#{public}\""
20-
defp format_legacy("", system), do: " SYSTEM \"#{system}\""
21-
defp format_legacy(public, system), do: " PUBLIC \"#{public}\" \"#{system}\""
226
end

lib/meeseeks/document/element.ex

Lines changed: 0 additions & 146 deletions
Original file line numberDiff line numberDiff line change
@@ -1,152 +1,6 @@
11
defmodule Meeseeks.Document.Element do
2-
use Meeseeks.Document.Node
32
@moduledoc false
43

5-
alias Meeseeks.Document
6-
alias Meeseeks.Document.Helpers
7-
84
@enforce_keys [:id]
95
defstruct parent: nil, id: nil, namespace: "", tag: "", attributes: [], children: []
10-
11-
@self_closing_tags [
12-
"area",
13-
"base",
14-
"br",
15-
"col",
16-
"command",
17-
"embed",
18-
"hr",
19-
"img",
20-
"input",
21-
"keygen",
22-
"link",
23-
"meta",
24-
"param",
25-
"source",
26-
"track",
27-
"wbr"
28-
]
29-
30-
@impl true
31-
def attr(node, attribute) do
32-
{_attr, value} = List.keyfind(node.attributes, attribute, 0, {nil, nil})
33-
value
34-
end
35-
36-
@impl true
37-
def attrs(node) do
38-
node.attributes
39-
end
40-
41-
@impl true
42-
def data(node, document) do
43-
child_nodes(document, node)
44-
|> Enum.filter(&data_node?/1)
45-
|> Enum.reduce("", &join_data(&1, &2, document))
46-
|> Helpers.collapse_whitespace()
47-
end
48-
49-
@impl true
50-
def html(node, document) do
51-
if node.tag in @self_closing_tags and node.children == [] do
52-
self_closing_tag(node)
53-
else
54-
opening_tag(node) <> child_html(node, document) <> closing_tag(node)
55-
end
56-
end
57-
58-
@impl true
59-
def own_text(node, document) do
60-
child_nodes(document, node)
61-
|> Enum.filter(&text_node?/1)
62-
|> Enum.reduce("", &join_text(&1, &2, document))
63-
|> Helpers.collapse_whitespace()
64-
end
65-
66-
@impl true
67-
def tag(node) do
68-
node.tag
69-
end
70-
71-
@impl true
72-
def text(node, document) do
73-
child_nodes(document, node)
74-
|> Enum.reduce("", &join_text(&1, &2, document))
75-
|> Helpers.collapse_whitespace()
76-
end
77-
78-
@impl true
79-
def tree(node, document) do
80-
child_nodes = child_nodes(document, node)
81-
{node.tag, node.attributes, Enum.map(child_nodes, &Document.Node.tree(&1, document))}
82-
end
83-
84-
# Helpers
85-
86-
defp self_closing_tag(node) do
87-
tag = full_tag(node.namespace, node.tag)
88-
attributes = join_attributes(node.attributes)
89-
"<#{tag}#{attributes} />"
90-
end
91-
92-
defp opening_tag(node) do
93-
tag = full_tag(node.namespace, node.tag)
94-
attributes = join_attributes(node.attributes)
95-
"<#{tag}#{attributes}>"
96-
end
97-
98-
defp child_html(node, document) do
99-
child_nodes(document, node)
100-
|> Enum.reduce("", &join_html(&1, &2, document))
101-
end
102-
103-
defp closing_tag(node) do
104-
tag = full_tag(node.namespace, node.tag)
105-
"</#{tag}>"
106-
end
107-
108-
defp full_tag("", tag), do: tag
109-
defp full_tag(ns, tag), do: ns <> ":" <> tag
110-
111-
defp child_nodes(document, node) do
112-
children = Document.children(document, node.id)
113-
114-
Document.get_nodes(document, children)
115-
end
116-
117-
defp data_node?(%Document.Data{}), do: true
118-
defp data_node?(_), do: false
119-
120-
defp text_node?(%Document.Text{}), do: true
121-
defp text_node?(_), do: false
122-
123-
defp join_attributes([]) do
124-
""
125-
end
126-
127-
defp join_attributes(attributes) do
128-
Enum.reduce(attributes, "", &join_attribute(&1, &2))
129-
end
130-
131-
defp join_attribute({attribute, value}, acc) do
132-
"#{acc} #{attribute}=\"#{Helpers.html_escape_attribute_value(value)}\""
133-
end
134-
135-
defp join_data(node, acc, document) do
136-
case Document.Node.data(node, document) do
137-
"" -> acc
138-
data -> "#{acc} #{data}"
139-
end
140-
end
141-
142-
defp join_html(node, acc, document) do
143-
acc <> Document.Node.html(node, document)
144-
end
145-
146-
defp join_text(node, acc, document) do
147-
case Document.Node.text(node, document) do
148-
"" -> acc
149-
text -> "#{acc} #{text}"
150-
end
151-
end
1526
end

0 commit comments

Comments
 (0)