feat: simplify arguments to avoid repetition

nelsonmestevao · nelsonmestevao · commit 37837e8e4129 · 2025-08-23T16:03:27.000+01:00
diff --git a/README.md b/README.md
@@ -56,18 +56,17 @@ end
 
 ## Usage
 
-Extract text from specific regions using bounding boxes `[x0, y0, x1, y1]`:
+Extract text from specific regions by mapping each zero-based page index to one or more bounding boxes `{x0, y0, x1, y1}`:
 
 ```elixir
-pages = [0, 1, 2] # zero based index
 areas = %{
   0 => {0, 0, 300, 200},    # Top-left area of page 0
   1 => [
         {200, 300, 600, 500}, # Bottom-right area of page 1
         {0, 0, 200, 250}, # Top-left area of page 1
        ]
 }
-PdfExtractor.extract_text("path/to/document.pdf", pages, areas)
+PdfExtractor.extract_text("path/to/document.pdf", areas)
 ```
 
 ### Return Format
@@ -90,4 +89,3 @@ This project is licensed under the MIT License - see the [LICENSE](LICENSE) file
 
 - Built on top of the excellent [pdfplumber](https://github.com/jsvine/pdfplumber) Python library
 - Uses [pythonx](https://github.com/livebook-dev/pythonx) for seamless Python integration
-
diff --git a/lib/pdf_extractor.ex b/lib/pdf_extractor.ex
@@ -11,8 +11,9 @@ defmodule PdfExtractor do
 
   # Client
 
-  def start_link([] = _opts \\ []) do
-    GenServer.start_link(__MODULE__, [], name: __MODULE__)
+  def start_link(opts \\ []) do
+    opts = Keyword.validate!(opts, name: __MODULE__)
+    GenServer.start_link(__MODULE__, [], name: opts[:name])
   end
 
   @doc ~S"""
@@ -47,9 +48,18 @@ defmodule PdfExtractor do
            "✂\nReceipt Payment part Account / Payable to\nCH4431999123000889012\n✂\nMax Muster & Söhne\nAccount / Payable to\nCH4431999123000889012 Musterstrasse 123\nMax Muster & Söhne 8000 Seldwyla\nMusterstrasse 123\n8000 Seldwyla\nReference\n210000000003139471430009017\nReference\n210000000003139471430009017\nAdditional information\nBestellung vom 15.10.2020\nPayable by (name/address)\nSimon Muster\nPayable by (name/address)\nMusterstrasse 1\nCurrency Amount\nSimon Muster\n8000 Seldwyla\nCHF 1 949.75 Musterstrasse 1\n8000 Seldwyla\nCurrency Amount\nCHF 1 949.75\nAcceptance point"
        }}
 
+    Extract text from specific pages only (page numbers are zero-based).
+
+      iex> PdfExtractor.extract_text("priv/fixtures/fatura.pdf", [0])
+      {:ok,
+       %{
+         0 =>
+           "Text Example Bill FATURA\n# 2025010002\nData: Jun 21, 2025\nProjeto de lei para:\nSaldo devedor: 1 525,59 €\nElixir Company\nItem Quantidade Avaliar Quantia\nTrabalho 1 1 500,00 € 1 500,00 €\nMais trabalho 1 25,59 € 25,59 €\nSubtotal: 1 525,59 €\nImposto (0%): 0,00 €\nTotal: 1 525,59 €"
+       }}
+
     Extract only the titles in the book chapters.
 
-      iex> PdfExtractor.extract_text("priv/fixtures/book.pdf", [2, 8, 10], %{
+      iex> PdfExtractor.extract_text("priv/fixtures/book.pdf", %{
       ...>   2 => {0, 0, 612, 190},
       ...>   8 => {0, 0, 612, 190},
       ...>   10 => {0, 0, 612, 190}
@@ -63,7 +73,7 @@ defmodule PdfExtractor do
 
     Extract multiple areas from a single page.
 
-      iex> PdfExtractor.extract_text("priv/fixtures/book.pdf", 1, %{
+      iex> PdfExtractor.extract_text("priv/fixtures/book.pdf", %{
       ...>   1 => [{0, 100, 612, 140}, {0, 400, 612, 440}]
       ...> })
       {:ok,
@@ -74,8 +84,8 @@ defmodule PdfExtractor do
          ]
        }}
   """
-  def extract_text(file_path, page_numbers \\ [], areas \\ %{}) do
-    GenServer.call(__MODULE__, {:extract_text, [file_path, page_numbers, areas]})
+  def extract_text(file_path, pages \\ []) do
+    GenServer.call(__MODULE__, {:extract_text, [file_path, pages]})
   end
 
   @doc ~S"""
@@ -99,11 +109,21 @@ defmodule PdfExtractor do
            "✂\nReceipt Payment part Account / Payable to\nCH4431999123000889012\n✂\nMax Muster & Söhne\nAccount / Payable to\nCH4431999123000889012 Musterstrasse 123\nMax Muster & Söhne 8000 Seldwyla\nMusterstrasse 123\n8000 Seldwyla\nReference\n210000000003139471430009017\nReference\n210000000003139471430009017\nAdditional information\nBestellung vom 15.10.2020\nPayable by (name/address)\nSimon Muster\nPayable by (name/address)\nMusterstrasse 1\nCurrency Amount\nSimon Muster\n8000 Seldwyla\nCHF 1 949.75 Musterstrasse 1\n8000 Seldwyla\nCurrency Amount\nCHF 1 949.75\nAcceptance point"
        }}
 
+    Extract text from specific pages only (page numbers are zero-based).
+
+      iex> content = File.read!("priv/fixtures/fatura.pdf")
+      ...> PdfExtractor.extract_text_from_binary(content, [0])
+      {:ok,
+       %{
+         0 =>
+           "Text Example Bill FATURA\n# 2025010002\nData: Jun 21, 2025\nProjeto de lei para:\nSaldo devedor: 1 525,59 €\nElixir Company\nItem Quantidade Avaliar Quantia\nTrabalho 1 1 500,00 € 1 500,00 €\nMais trabalho 1 25,59 € 25,59 €\nSubtotal: 1 525,59 €\nImposto (0%): 0,00 €\nTotal: 1 525,59 €"
+       }}
+
     Extract only the titles in the book chapters.
 
       iex> content = File.read!("priv/fixtures/book.pdf")
       ...>
-      ...> PdfExtractor.extract_text_from_binary(content, [2, 8, 10], %{
+      ...> PdfExtractor.extract_text_from_binary(content, %{
       ...>   2 => {0, 0, 612, 190},
       ...>   8 => {0, 0, 612, 190},
       ...>   10 => {0, 0, 612, 190}
@@ -119,7 +139,7 @@ defmodule PdfExtractor do
 
       iex> content = File.read!("priv/fixtures/book.pdf")
       ...>
-      ...> PdfExtractor.extract_text_from_binary(content, 1, %{
+      ...> PdfExtractor.extract_text_from_binary(content, %{
       ...>   1 => [{0, 100, 612, 140}, {0, 400, 612, 440}]
       ...> })
       {:ok,
@@ -131,8 +151,8 @@ defmodule PdfExtractor do
        }}
 
   """
-  def extract_text_from_binary(binary, page_numbers \\ [], areas \\ %{}) do
-    GenServer.call(__MODULE__, {:extract_text_from_binary, [binary, page_numbers, areas]})
+  def extract_text_from_binary(binary, pages \\ []) do
+    GenServer.call(__MODULE__, {:extract_text_from_binary, [binary, pages]})
   end
 
   @doc """
diff --git a/lib/pdf_extractor/pdf_plumber.ex b/lib/pdf_extractor/pdf_plumber.ex
@@ -18,27 +18,42 @@ defmodule PdfExtractor.PdfPlumber do
 
   @spec extract_text(
           file_path :: String.t(),
-          page_number :: page() | list(page()),
-          areas :: %{page() => area() | [area()] | nil}
+          pages :: page() | list(page()) | %{page() => area() | [area()] | nil}
         ) :: %{page() => String.t() | list(String.t())}
-  def extract_text(file_path, page_number, areas) when is_integer(page_number) do
-    extract_text(file_path, List.wrap(page_number), areas)
+  def extract_text(file_path, page_number) when is_integer(page_number) do
+    extract_text(file_path, List.wrap(page_number))
   end
 
-  def extract_text(file_path, page_numbers, areas) when is_list(page_numbers) and is_map(areas) do
+  def extract_text(file_path, pages) when is_list(pages) do
     """
     #{python_extract_code()}
 
     main(file_path.decode('utf-8'), page_numbers, areas)
     """
     |> Pythonx.eval(%{
       "file_path" => file_path,
-      "page_numbers" => page_numbers,
-      "areas" => areas
+      "page_numbers" => pages,
+      "areas" => %{}
     })
     |> elem(0)
     |> Pythonx.decode()
-    |> to_map(page_numbers)
+    |> to_map(pages)
+  end
+
+  def extract_text(file_path, pages) when is_map(pages) do
+    """
+    #{python_extract_code()}
+
+    main(file_path.decode('utf-8'), page_numbers, areas)
+    """
+    |> Pythonx.eval(%{
+      "file_path" => file_path,
+      "page_numbers" => Map.keys(pages),
+      "areas" => pages
+    })
+    |> elem(0)
+    |> Pythonx.decode()
+    |> to_map(Map.keys(pages))
   end
 
   @doc """
@@ -47,11 +62,29 @@ defmodule PdfExtractor.PdfPlumber do
     url = "https://erlang.org/download/armstrong_thesis_2003.pdf"
     url |> :httpc.request() |> elem(1) |> elem(2) |> :binary.list_to_bin() |> PdfExtractor.extract_text_from_binary()
   """
-  def extract_text_from_binary(binary, page_number, areas) when is_integer(page_number) do
-    extract_text_from_binary(binary, List.wrap(page_number), areas)
+  def extract_text_from_binary(binary, page_number) when is_integer(page_number) do
+    extract_text_from_binary(binary, List.wrap(page_number))
+  end
+
+  def extract_text_from_binary(binary, pages) when is_list(pages) do
+    """
+    from io import BytesIO
+
+    #{python_extract_code()}
+
+    main(BytesIO(binary), page_numbers, areas)
+    """
+    |> Pythonx.eval(%{
+      "binary" => binary,
+      "page_numbers" => pages,
+      "areas" => %{}
+    })
+    |> elem(0)
+    |> Pythonx.decode()
+    |> to_map(pages)
   end
 
-  def extract_text_from_binary(binary, page_numbers, areas) when is_list(page_numbers) and is_map(areas) do
+  def extract_text_from_binary(binary, pages) when is_map(pages) do
     """
     from io import BytesIO
 
@@ -61,12 +94,12 @@ defmodule PdfExtractor.PdfPlumber do
     """
     |> Pythonx.eval(%{
       "binary" => binary,
-      "page_numbers" => page_numbers,
-      "areas" => areas
+      "page_numbers" => Map.keys(pages),
+      "areas" => pages
     })
     |> elem(0)
     |> Pythonx.decode()
-    |> to_map(page_numbers)
+    |> to_map(Map.keys(pages))
   end
 
   defp python_extract_code do
diff --git a/test/pdf_extractor_test.exs b/test/pdf_extractor_test.exs
@@ -61,7 +61,7 @@ defmodule PdfExtractorTest do
       assert Map.keys(result) == [1]
       assert result[1] == @test_file_content[1]
 
-      assert {:ok, result} = PdfExtractor.extract_text(@test_file_path, [0, 1], %{1 => nil})
+      assert {:ok, result} = PdfExtractor.extract_text(@test_file_path, [0, 1])
 
       assert is_map(result)
       assert Map.keys(result) == [0, 1]
@@ -74,7 +74,7 @@ defmodule PdfExtractorTest do
         1 => [{0, 0, 300, 400}, {0, 270, 595, 840}]
       }
 
-      assert PdfExtractor.extract_text(@test_file_path, Map.keys(areas), areas) ==
+      assert PdfExtractor.extract_text(@test_file_path, areas) ==
                {:ok,
                 %{
                   0 => "Text Example Bill\nProjeto de lei para:\nElixir Company",
@@ -143,7 +143,7 @@ defmodule PdfExtractorTest do
       assert result[1] == @test_file_content[1]
 
       assert {:ok, result} =
-               PdfExtractor.extract_text_from_binary(test_file_binary_content, [0, 1], %{1 => nil})
+               PdfExtractor.extract_text_from_binary(test_file_binary_content, [0, 1])
 
       assert is_map(result)
       assert Map.keys(result) == [0, 1]
@@ -158,11 +158,7 @@ defmodule PdfExtractorTest do
         1 => [{0, 0, 300, 400}, {0, 270, 595, 840}]
       }
 
-      assert PdfExtractor.extract_text_from_binary(
-               test_file_binary_content,
-               Map.keys(areas),
-               areas
-             ) ==
+      assert PdfExtractor.extract_text_from_binary(test_file_binary_content, areas) ==
                {:ok,
                 %{
                   0 => "Text Example Bill\nProjeto de lei para:\nElixir Company",