Skip to content

Commit ea166a0

Browse files
committed
Specs and docs for parser
1 parent 9bc364b commit ea166a0

1 file changed

Lines changed: 73 additions & 70 deletions

File tree

lib/mudbrick/parser.ex

Lines changed: 73 additions & 70 deletions
Original file line number | Diff line number | Diff line change
@@ -15,8 +15,9 @@ defmodule Mudbrick.Parser do
1515
}
1616

1717
@doc """
18-
Parse Mudbrick-generated `iodata` into a Mudbrick document.
18+
Parse Mudbrick-generated `iodata` into a `Mudbrick.Document`.
1919
"""
20+
@spec parse(iodata()) :: Mudbrick.Document.t()
2021
def parse(iodata) do
2122
{:ok, parsed_items, _rest, %{}, _, _} =
2223
iodata
@@ -48,9 +49,10 @@ defmodule Mudbrick.Parser do
4849
end
4950

5051
@doc """
51-
Parse a section of a Mudbrick-generated PDF with a specific parsing function.
52+
Parse a section of a Mudbrick-generated PDF with a named parsing function.
5253
Mostly useful for debugging this parser.
5354
"""
55+
@spec parse(iodata(), atom()) :: term()
5456
def parse(iodata, f) do
5557
case iodata
5658
|> IO.iodata_to_binary()
@@ -59,6 +61,75 @@ defmodule Mudbrick.Parser do
5961
end
6062
end
6163

64+
@doc """
65+
Extract text content from a Mudbrick-generated PDF. Will map glyphs back to
66+
their original characters.
67+
68+
## With compression
69+
70+
iex> import Mudbrick.TestHelper
71+
...> import Mudbrick
72+
...> new(compress: true, fonts: %{bodoni: bodoni_regular(), franklin: franklin_regular()})
73+
...> |> page()
74+
...> |> text({"hello, world!", underline: [width: 1]}, font: :bodoni)
75+
...> |> text("hello in another font", font: :franklin)
76+
...> |> Mudbrick.render()
77+
...> |> Mudbrick.Parser.extract_text()
78+
[ "hello, world!", "hello in another font" ]
79+
80+
## Without compression
81+
82+
iex> import Mudbrick.TestHelper
83+
...> import Mudbrick
84+
...> new(fonts: %{bodoni: bodoni_regular(), franklin: franklin_regular()})
85+
...> |> page()
86+
...> |> text({"hello, world!", underline: [width: 1]}, font: :bodoni)
87+
...> |> text("hello in another font", font: :franklin)
88+
...> |> Mudbrick.render()
89+
...> |> Mudbrick.Parser.extract_text()
90+
[ "hello, world!", "hello in another font" ]
91+
92+
"""
93+
@spec extract_text(iodata()) :: [String.t()]
94+
def extract_text(iodata) do
95+
alias Mudbrick.ContentStream.{Tf, TJ}
96+
97+
doc = parse(iodata)
98+
99+
content_stream =
100+
Mudbrick.Document.find_object(doc, &match?(%Mudbrick.ContentStream{}, &1))
101+
102+
page_tree = Mudbrick.Document.root_page_tree(doc)
103+
fonts = page_tree.value.fonts
104+
105+
{text_items, _last_found_font} =
106+
content_stream.value.operations
107+
|> List.foldr({[], nil}, fn
108+
%Tf{font_identifier: font_identifier}, {text_items, _current_font} ->
109+
{text_items, Map.fetch!(fonts, font_identifier).value.parsed}
110+
111+
%TJ{kerned_text: kerned_text}, {text_items, current_font} ->
112+
text =
113+
kerned_text
114+
|> Enum.map(fn
115+
{hex_glyph, _kern} -> hex_glyph
116+
hex_glyph -> hex_glyph
117+
end)
118+
|> Enum.map(fn hex_glyph ->
119+
{decimal_glyph, _} = Integer.parse(hex_glyph, 16)
120+
Map.fetch!(current_font.gid2cid, decimal_glyph)
121+
end)
122+
|> to_string()
123+
124+
{[text | text_items], current_font}
125+
126+
_operation, {text_items, current_font} ->
127+
{text_items, current_font}
128+
end)
129+
130+
Enum.reverse(text_items)
131+
end
132+
62133
@doc false
63134
defparsec(:boolean, boolean())
64135
@doc false
@@ -181,74 +252,6 @@ defmodule Mudbrick.Parser do
181252
}
182253
end
183254

184-
@doc """
185-
Extract text content from a Mudbrick-generated PDF. Will map glyphs back to
186-
their original characters.
187-
188-
## With compression
189-
190-
iex> import Mudbrick.TestHelper
191-
...> import Mudbrick
192-
...> new(compress: true, fonts: %{bodoni: bodoni_regular(), franklin: franklin_regular()})
193-
...> |> page()
194-
...> |> text({"hello, world!", underline: [width: 1]}, font: :bodoni)
195-
...> |> text("hello in another font", font: :franklin)
196-
...> |> Mudbrick.render()
197-
...> |> Mudbrick.Parser.extract_text()
198-
[ "hello, world!", "hello in another font" ]
199-
200-
## Without compression
201-
202-
iex> import Mudbrick.TestHelper
203-
...> import Mudbrick
204-
...> new(fonts: %{bodoni: bodoni_regular(), franklin: franklin_regular()})
205-
...> |> page()
206-
...> |> text({"hello, world!", underline: [width: 1]}, font: :bodoni)
207-
...> |> text("hello in another font", font: :franklin)
208-
...> |> Mudbrick.render()
209-
...> |> Mudbrick.Parser.extract_text()
210-
[ "hello, world!", "hello in another font" ]
211-
212-
"""
213-
def extract_text(iodata) do
214-
alias Mudbrick.ContentStream.{Tf, TJ}
215-
216-
doc = parse(iodata)
217-
218-
content_stream =
219-
Mudbrick.Document.find_object(doc, &match?(%Mudbrick.ContentStream{}, &1))
220-
221-
page_tree = Mudbrick.Document.root_page_tree(doc)
222-
fonts = page_tree.value.fonts
223-
224-
{text_items, _last_found_font} =
225-
content_stream.value.operations
226-
|> List.foldr({[], nil}, fn
227-
%Tf{font_identifier: font_identifier}, {text_items, _current_font} ->
228-
{text_items, Map.fetch!(fonts, font_identifier).value.parsed}
229-
230-
%TJ{kerned_text: kerned_text}, {text_items, current_font} ->
231-
text =
232-
kerned_text
233-
|> Enum.map(fn
234-
{hex_glyph, _kern} -> hex_glyph
235-
hex_glyph -> hex_glyph
236-
end)
237-
|> Enum.map(fn hex_glyph ->
238-
{decimal_glyph, _} = Integer.parse(hex_glyph, 16)
239-
Map.fetch!(current_font.gid2cid, decimal_glyph)
240-
end)
241-
|> to_string()
242-
243-
{[text | text_items], current_font}
244-
245-
_operation, {text_items, current_font} ->
246-
{text_items, current_font}
247-
end)
248-
249-
Enum.reverse(text_items)
250-
end
251-
252255
@doc false
253256
def to_mudbrick(iodata, f),
254257
do:

0 commit comments

Comments
 (0)