Skip to content

Commit 37837e8

Browse files
feat: simplify arguments to avoid repetition
1 parent e683740 commit 37837e8

File tree

4 files changed

+83
-36
lines changed

4 files changed

+83
-36
lines changed

README.md

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -56,18 +56,17 @@ end
5656

5757
## Usage
5858

59-
Extract text from specific regions using bounding boxes `[x0, y0, x1, y1]`:
59+
Extract text from specific regions using bounding boxes `{x0, y0, x1, y1}`:
6060

6161
```elixir
62-
pages = [0, 1, 2] # zero based index
6362
areas = %{
6463
0 => {0, 0, 300, 200}, # Top-left area of page 0
6564
1 => [
6665
{200, 300, 600, 500}, # Bottom-right area of page 1
6766
{0, 0, 200, 250} # Top-left area of page 1
6867
]
6968
}
70-
PdfExtractor.extract_text("path/to/document.pdf", pages, areas)
69+
PdfExtractor.extract_text("path/to/document.pdf", areas)
7170
```
7271

7372
### Return Format
@@ -90,4 +89,3 @@ This project is licensed under the MIT License - see the [LICENSE](LICENSE) file
9089

9190
- Built on top of the excellent [pdfplumber](https://github.com/jsvine/pdfplumber) Python library
9291
- Uses [pythonx](https://github.com/livebook-dev/pythonx) for seamless Python integration
93-

lib/pdf_extractor.ex

Lines changed: 30 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,9 @@ defmodule PdfExtractor do
1111

1212
# Client
1313

14-
def start_link([] = _opts \\ []) do
15-
GenServer.start_link(__MODULE__, [], name: __MODULE__)
14+
def start_link(opts \\ []) do
15+
opts = Keyword.validate!(opts, name: __MODULE__)
16+
GenServer.start_link(__MODULE__, [], name: opts[:name])
1617
end
1718

1819
@doc ~S"""
@@ -47,9 +48,18 @@ defmodule PdfExtractor do
4748
"✂\nReceipt Payment part Account / Payable to\nCH4431999123000889012\n✂\nMax Muster & Söhne\nAccount / Payable to\nCH4431999123000889012 Musterstrasse 123\nMax Muster & Söhne 8000 Seldwyla\nMusterstrasse 123\n8000 Seldwyla\nReference\n210000000003139471430009017\nReference\n210000000003139471430009017\nAdditional information\nBestellung vom 15.10.2020\nPayable by (name/address)\nSimon Muster\nPayable by (name/address)\nMusterstrasse 1\nCurrency Amount\nSimon Muster\n8000 Seldwyla\nCHF 1 949.75 Musterstrasse 1\n8000 Seldwyla\nCurrency Amount\nCHF 1 949.75\nAcceptance point"
4849
}}
4950
51+
Extract text from only some pages.
52+
53+
iex> PdfExtractor.extract_text("priv/fixtures/fatura.pdf", [0])
54+
{:ok,
55+
%{
56+
0 =>
57+
"Text Example Bill FATURA\n# 2025010002\nData: Jun 21, 2025\nProjeto de lei para:\nSaldo devedor: 1 525,59 €\nElixir Company\nItem Quantidade Avaliar Quantia\nTrabalho 1 1 500,00 € 1 500,00 €\nMais trabalho 1 25,59 € 25,59 €\nSubtotal: 1 525,59 €\nImposto (0%): 0,00 €\nTotal: 1 525,59 €"
58+
}}
59+
5060
Extract only the titles in the book chapters.
5161
52-
iex> PdfExtractor.extract_text("priv/fixtures/book.pdf", [2, 8, 10], %{
62+
iex> PdfExtractor.extract_text("priv/fixtures/book.pdf", %{
5363
...> 2 => {0, 0, 612, 190},
5464
...> 8 => {0, 0, 612, 190},
5565
...> 10 => {0, 0, 612, 190}
@@ -63,7 +73,7 @@ defmodule PdfExtractor do
6373
6474
Extract multiple areas from a single page.
6575
66-
iex> PdfExtractor.extract_text("priv/fixtures/book.pdf", 1, %{
76+
iex> PdfExtractor.extract_text("priv/fixtures/book.pdf", %{
6777
...> 1 => [{0, 100, 612, 140}, {0, 400, 612, 440}]
6878
...> })
6979
{:ok,
@@ -74,8 +84,8 @@ defmodule PdfExtractor do
7484
]
7585
}}
7686
"""
77-
def extract_text(file_path, page_numbers \\ [], areas \\ %{}) do
78-
GenServer.call(__MODULE__, {:extract_text, [file_path, page_numbers, areas]})
87+
def extract_text(file_path, pages \\ []) do
88+
GenServer.call(__MODULE__, {:extract_text, [file_path, pages]})
7989
end
8090

8191
@doc ~S"""
@@ -99,11 +109,21 @@ defmodule PdfExtractor do
99109
"✂\nReceipt Payment part Account / Payable to\nCH4431999123000889012\n✂\nMax Muster & Söhne\nAccount / Payable to\nCH4431999123000889012 Musterstrasse 123\nMax Muster & Söhne 8000 Seldwyla\nMusterstrasse 123\n8000 Seldwyla\nReference\n210000000003139471430009017\nReference\n210000000003139471430009017\nAdditional information\nBestellung vom 15.10.2020\nPayable by (name/address)\nSimon Muster\nPayable by (name/address)\nMusterstrasse 1\nCurrency Amount\nSimon Muster\n8000 Seldwyla\nCHF 1 949.75 Musterstrasse 1\n8000 Seldwyla\nCurrency Amount\nCHF 1 949.75\nAcceptance point"
100110
}}
101111
112+
Extract text from only some pages.
113+
114+
iex> content = File.read!("priv/fixtures/fatura.pdf")
115+
...> PdfExtractor.extract_text_from_binary(content, [0])
116+
{:ok,
117+
%{
118+
0 =>
119+
"Text Example Bill FATURA\n# 2025010002\nData: Jun 21, 2025\nProjeto de lei para:\nSaldo devedor: 1 525,59 €\nElixir Company\nItem Quantidade Avaliar Quantia\nTrabalho 1 1 500,00 € 1 500,00 €\nMais trabalho 1 25,59 € 25,59 €\nSubtotal: 1 525,59 €\nImposto (0%): 0,00 €\nTotal: 1 525,59 €"
120+
}}
121+
102122
Extract only the titles in the book chapters.
103123
104124
iex> content = File.read!("priv/fixtures/book.pdf")
105125
...>
106-
...> PdfExtractor.extract_text_from_binary(content, [2, 8, 10], %{
126+
...> PdfExtractor.extract_text_from_binary(content, %{
107127
...> 2 => {0, 0, 612, 190},
108128
...> 8 => {0, 0, 612, 190},
109129
...> 10 => {0, 0, 612, 190}
@@ -119,7 +139,7 @@ defmodule PdfExtractor do
119139
120140
iex> content = File.read!("priv/fixtures/book.pdf")
121141
...>
122-
...> PdfExtractor.extract_text_from_binary(content, 1, %{
142+
...> PdfExtractor.extract_text_from_binary(content, %{
123143
...> 1 => [{0, 100, 612, 140}, {0, 400, 612, 440}]
124144
...> })
125145
{:ok,
@@ -131,8 +151,8 @@ defmodule PdfExtractor do
131151
}}
132152
133153
"""
134-
def extract_text_from_binary(binary, page_numbers \\ [], areas \\ %{}) do
135-
GenServer.call(__MODULE__, {:extract_text_from_binary, [binary, page_numbers, areas]})
154+
def extract_text_from_binary(binary, pages \\ []) do
155+
GenServer.call(__MODULE__, {:extract_text_from_binary, [binary, pages]})
136156
end
137157

138158
@doc """

lib/pdf_extractor/pdf_plumber.ex

Lines changed: 47 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -18,27 +18,42 @@ defmodule PdfExtractor.PdfPlumber do
1818

1919
@spec extract_text(
2020
file_path :: String.t(),
21-
page_number :: page() | list(page()),
22-
areas :: %{page() => area() | [area()] | nil}
21+
pages :: page() | list(page()) | %{page() => area() | [area()] | nil}
2322
) :: %{page() => String.t() | list(String.t())}
24-
def extract_text(file_path, page_number, areas) when is_integer(page_number) do
25-
extract_text(file_path, List.wrap(page_number), areas)
23+
def extract_text(file_path, page_number) when is_integer(page_number) do
24+
extract_text(file_path, List.wrap(page_number))
2625
end
2726

28-
def extract_text(file_path, page_numbers, areas) when is_list(page_numbers) and is_map(areas) do
27+
def extract_text(file_path, pages) when is_list(pages) do
2928
"""
3029
#{python_extract_code()}
3130
3231
main(file_path.decode('utf-8'), page_numbers, areas)
3332
"""
3433
|> Pythonx.eval(%{
3534
"file_path" => file_path,
36-
"page_numbers" => page_numbers,
37-
"areas" => areas
35+
"page_numbers" => pages,
36+
"areas" => %{}
3837
})
3938
|> elem(0)
4039
|> Pythonx.decode()
41-
|> to_map(page_numbers)
40+
|> to_map(pages)
41+
end
42+
43+
def extract_text(file_path, pages) when is_map(pages) do
44+
"""
45+
#{python_extract_code()}
46+
47+
main(file_path.decode('utf-8'), page_numbers, areas)
48+
"""
49+
|> Pythonx.eval(%{
50+
"file_path" => file_path,
51+
"page_numbers" => Map.keys(pages),
52+
"areas" => pages
53+
})
54+
|> elem(0)
55+
|> Pythonx.decode()
56+
|> to_map(Map.keys(pages))
4257
end
4358

4459
@doc """
@@ -47,11 +62,29 @@ defmodule PdfExtractor.PdfPlumber do
4762
url = "https://erlang.org/download/armstrong_thesis_2003.pdf"
4863
url |> :httpc.request() |> elem(1) |> elem(2) |> :binary.list_to_bin() |> PdfExtractor.extract_text_from_binary()
4964
"""
50-
def extract_text_from_binary(binary, page_number, areas) when is_integer(page_number) do
51-
extract_text_from_binary(binary, List.wrap(page_number), areas)
65+
def extract_text_from_binary(binary, page_number) when is_integer(page_number) do
66+
extract_text_from_binary(binary, List.wrap(page_number))
67+
end
68+
69+
def extract_text_from_binary(binary, pages) when is_list(pages) do
70+
"""
71+
from io import BytesIO
72+
73+
#{python_extract_code()}
74+
75+
main(BytesIO(binary), page_numbers, areas)
76+
"""
77+
|> Pythonx.eval(%{
78+
"binary" => binary,
79+
"page_numbers" => pages,
80+
"areas" => %{}
81+
})
82+
|> elem(0)
83+
|> Pythonx.decode()
84+
|> to_map(pages)
5285
end
5386

54-
def extract_text_from_binary(binary, page_numbers, areas) when is_list(page_numbers) and is_map(areas) do
87+
def extract_text_from_binary(binary, pages) when is_map(pages) do
5588
"""
5689
from io import BytesIO
5790
@@ -61,12 +94,12 @@ defmodule PdfExtractor.PdfPlumber do
6194
"""
6295
|> Pythonx.eval(%{
6396
"binary" => binary,
64-
"page_numbers" => page_numbers,
65-
"areas" => areas
97+
"page_numbers" => Map.keys(pages),
98+
"areas" => pages
6699
})
67100
|> elem(0)
68101
|> Pythonx.decode()
69-
|> to_map(page_numbers)
102+
|> to_map(Map.keys(pages))
70103
end
71104

72105
defp python_extract_code do

test/pdf_extractor_test.exs

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@ defmodule PdfExtractorTest do
6161
assert Map.keys(result) == [1]
6262
assert result[1] == @test_file_content[1]
6363

64-
assert {:ok, result} = PdfExtractor.extract_text(@test_file_path, [0, 1], %{1 => nil})
64+
assert {:ok, result} = PdfExtractor.extract_text(@test_file_path, [0, 1])
6565

6666
assert is_map(result)
6767
assert Map.keys(result) == [0, 1]
@@ -74,7 +74,7 @@ defmodule PdfExtractorTest do
7474
1 => [{0, 0, 300, 400}, {0, 270, 595, 840}]
7575
}
7676

77-
assert PdfExtractor.extract_text(@test_file_path, Map.keys(areas), areas) ==
77+
assert PdfExtractor.extract_text(@test_file_path, areas) ==
7878
{:ok,
7979
%{
8080
0 => "Text Example Bill\nProjeto de lei para:\nElixir Company",
@@ -143,7 +143,7 @@ defmodule PdfExtractorTest do
143143
assert result[1] == @test_file_content[1]
144144

145145
assert {:ok, result} =
146-
PdfExtractor.extract_text_from_binary(test_file_binary_content, [0, 1], %{1 => nil})
146+
PdfExtractor.extract_text_from_binary(test_file_binary_content, [0, 1])
147147

148148
assert is_map(result)
149149
assert Map.keys(result) == [0, 1]
@@ -158,11 +158,7 @@ defmodule PdfExtractorTest do
158158
1 => [{0, 0, 300, 400}, {0, 270, 595, 840}]
159159
}
160160

161-
assert PdfExtractor.extract_text_from_binary(
162-
test_file_binary_content,
163-
Map.keys(areas),
164-
areas
165-
) ==
161+
assert PdfExtractor.extract_text_from_binary(test_file_binary_content, areas) ==
166162
{:ok,
167163
%{
168164
0 => "Text Example Bill\nProjeto de lei para:\nElixir Company",

0 commit comments

Comments
 (0)