@@ -15,8 +15,9 @@ defmodule Mudbrick.Parser do
1515 }
1616
1717 @ doc """
18- Parse Mudbrick-generated `iodata` into a Mudbrick document .
18+ Parse Mudbrick-generated `iodata` into a ` Mudbrick.Document` .
1919 """
20+ @ spec parse ( iodata ( ) ) :: Mudbrick.Document . t ( )
2021 def parse ( iodata ) do
2122 { :ok , parsed_items , _rest , % { } , _ , _ } =
2223 iodata
@@ -48,9 +49,10 @@ defmodule Mudbrick.Parser do
4849 end
4950
5051 @ doc """
51- Parse a section of a Mudbrick-generated PDF with a specific parsing function.
52+ Parse a section of a Mudbrick-generated PDF with a named parsing function.
5253 Mostly useful for debugging this parser.
5354 """
55+ @ spec parse ( iodata ( ) , atom ( ) ) :: term ( )
5456 def parse ( iodata , f ) do
5557 case iodata
5658 |> IO . iodata_to_binary ( )
@@ -59,6 +61,75 @@ defmodule Mudbrick.Parser do
5961 end
6062 end
6163
@doc """
Recover the text drawn by a Mudbrick-generated PDF, translating every glyph
ID back into the character it was produced from.

Returns one string per text-showing (`TJ`) operation found in the document's
content stream, decoded with whichever font was selected at that point.

## With compression

    iex> import Mudbrick.TestHelper
    ...> import Mudbrick
    ...> new(compress: true, fonts: %{bodoni: bodoni_regular(), franklin: franklin_regular()})
    ...> |> page()
    ...> |> text({"hello, world!", underline: [width: 1]}, font: :bodoni)
    ...> |> text("hello in another font", font: :franklin)
    ...> |> Mudbrick.render()
    ...> |> Mudbrick.Parser.extract_text()
    ["hello, world!", "hello in another font"]

## Without compression

    iex> import Mudbrick.TestHelper
    ...> import Mudbrick
    ...> new(fonts: %{bodoni: bodoni_regular(), franklin: franklin_regular()})
    ...> |> page()
    ...> |> text({"hello, world!", underline: [width: 1]}, font: :bodoni)
    ...> |> text("hello in another font", font: :franklin)
    ...> |> Mudbrick.render()
    ...> |> Mudbrick.Parser.extract_text()
    ["hello, world!", "hello in another font"]

"""
@spec extract_text(iodata()) :: [String.t()]
def extract_text(iodata) do
  alias Mudbrick.ContentStream.{Tf, TJ}

  document = parse(iodata)

  stream =
    Mudbrick.Document.find_object(document, fn object ->
      match?(%Mudbrick.ContentStream{}, object)
    end)

  fonts = Mudbrick.Document.root_page_tree(document).value.fonts

  # foldr visits the operations from the last list element to the first.
  # NOTE(review): presumably the operations list is stored newest-first, so
  # this walks them in drawing order -- confirm against ContentStream.
  {collected, _last_font} =
    List.foldr(stream.value.operations, {[], nil}, fn operation, {strings, font} ->
      case operation do
        # A font selection only changes which font decodes later glyphs.
        %Tf{font_identifier: identifier} ->
          {strings, Map.fetch!(fonts, identifier).value.parsed}

        %TJ{kerned_text: kerned_text} ->
          decoded =
            for entry <- kerned_text do
              # Entries are either a bare hex glyph or a {hex_glyph, kern} pair.
              hex =
                case entry do
                  {glyph, _kern} -> glyph
                  glyph -> glyph
                end

              {glyph_id, _rest} = Integer.parse(hex, 16)
              Map.fetch!(font.gid2cid, glyph_id)
            end

          {[List.to_string(decoded) | strings], font}

        _other ->
          {strings, font}
      end
    end)

  # Strings were prepended as they were found; flip them back into visit order.
  Enum.reverse(collected)
end
132+
62133 @ doc false
63134 defparsec ( :boolean , boolean ( ) )
64135 @ doc false
@@ -181,74 +252,6 @@ defmodule Mudbrick.Parser do
181252 }
182253 end
183254
184- @ doc """
185- Extract text content from a Mudbrick-generated PDF. Will map glyphs back to
186- their original characters.
187-
188- ## With compression
189-
190- iex> import Mudbrick.TestHelper
191- ...> import Mudbrick
192- ...> new(compress: true, fonts: %{bodoni: bodoni_regular(), franklin: franklin_regular()})
193- ...> |> page()
194- ...> |> text({"hello, world!", underline: [width: 1]}, font: :bodoni)
195- ...> |> text("hello in another font", font: :franklin)
196- ...> |> Mudbrick.render()
197- ...> |> Mudbrick.Parser.extract_text()
198- [ "hello, world!", "hello in another font" ]
199-
200- ## Without compression
201-
202- iex> import Mudbrick.TestHelper
203- ...> import Mudbrick
204- ...> new(fonts: %{bodoni: bodoni_regular(), franklin: franklin_regular()})
205- ...> |> page()
206- ...> |> text({"hello, world!", underline: [width: 1]}, font: :bodoni)
207- ...> |> text("hello in another font", font: :franklin)
208- ...> |> Mudbrick.render()
209- ...> |> Mudbrick.Parser.extract_text()
210- [ "hello, world!", "hello in another font" ]
211-
212- """
213- def extract_text ( iodata ) do
214- alias Mudbrick.ContentStream . { Tf , TJ }
215-
216- doc = parse ( iodata )
217-
218- content_stream =
219- Mudbrick.Document . find_object ( doc , & match? ( % Mudbrick.ContentStream { } , & 1 ) )
220-
221- page_tree = Mudbrick.Document . root_page_tree ( doc )
222- fonts = page_tree . value . fonts
223-
224- { text_items , _last_found_font } =
225- content_stream . value . operations
226- |> List . foldr ( { [ ] , nil } , fn
227- % Tf { font_identifier: font_identifier } , { text_items , _current_font } ->
228- { text_items , Map . fetch! ( fonts , font_identifier ) . value . parsed }
229-
230- % TJ { kerned_text: kerned_text } , { text_items , current_font } ->
231- text =
232- kerned_text
233- |> Enum . map ( fn
234- { hex_glyph , _kern } -> hex_glyph
235- hex_glyph -> hex_glyph
236- end )
237- |> Enum . map ( fn hex_glyph ->
238- { decimal_glyph , _ } = Integer . parse ( hex_glyph , 16 )
239- Map . fetch! ( current_font . gid2cid , decimal_glyph )
240- end )
241- |> to_string ( )
242-
243- { [ text | text_items ] , current_font }
244-
245- _operation , { text_items , current_font } ->
246- { text_items , current_font }
247- end )
248-
249- Enum . reverse ( text_items )
250- end
251-
252255 @ doc false
253256 def to_mudbrick ( iodata , f ) ,
254257 do:
0 commit comments