Skip to content

Commit aa4d007

Browse files
Refactor/ingestion (#209)
Co-authored-by: Filip Christiansen <[email protected]>
1 parent 42e4e26 commit aa4d007

34 files changed

+1199
-1487
lines changed

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -173,3 +173,6 @@ Caddyfile
173173

174174
# ignore default output directory
175175
tmp/*
176+
177+
# Gitingest
178+
digest.txt

.pre-commit-config.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,7 @@ repos:
9595
files: ^src/
9696
additional_dependencies:
9797
[
98+
chardet,
9899
click,
99100
fastapi-analytics,
100101
pytest-asyncio,
@@ -112,6 +113,7 @@ repos:
112113
- --rcfile=tests/.pylintrc
113114
additional_dependencies:
114115
[
116+
chardet,
115117
click,
116118
fastapi-analytics,
117119
pytest,

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -142,7 +142,7 @@ Gitingest aims to be friendly for first time contributors, with a simple Python
142142
- [tiktoken](https://github.com/openai/tiktoken) - Token estimation
143143
- [posthog](https://github.com/PostHog/posthog) - Amazing analytics
144144

145-
### Looking for a JavaScript/Node package?
145+
### Looking for a JavaScript/Node package?
146146

147147
Check out the NPM alternative 📦 Repomix: <https://github.com/yamadashy/repomix>
148148

pyproject.toml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,13 @@
11
[project]
22
name = "gitingest"
3-
version = "0.1.3"
3+
version = "0.1.4"
44
description="CLI tool to analyze and create text dumps of codebases for LLMs"
55
readme = {file = "README.md", content-type = "text/markdown" }
66
requires-python = ">= 3.8"
77
dependencies = [
88
"click>=8.0.0",
99
"tiktoken",
10+
"tomli",
1011
"typing_extensions; python_version < '3.10'",
1112
]
1213

@@ -52,6 +53,7 @@ disable = [
5253
"too-few-public-methods",
5354
"broad-exception-caught",
5455
"duplicate-code",
56+
"fixme",
5557
]
5658

5759
[tool.pycln]

requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
chardet
12
click>=8.0.0
23
fastapi[standard]
34
python-dotenv

src/gitingest/__init__.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
""" Gitingest: A package for ingesting data from Git repositories. """
22

3-
from gitingest.query_ingestion import run_ingest_query
4-
from gitingest.query_parser import parse_query
5-
from gitingest.repository_clone import clone_repo
3+
from gitingest.cloning import clone_repo
4+
from gitingest.ingestion import ingest_query
5+
from gitingest.query_parsing import parse_query
66
from gitingest.repository_ingest import ingest, ingest_async
77

8-
__all__ = ["run_ingest_query", "clone_repo", "parse_query", "ingest", "ingest_async"]
8+
__all__ = ["ingest_query", "clone_repo", "parse_query", "ingest", "ingest_async"]

src/gitingest/cli.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77

88
import click
99

10-
from gitingest.config import MAX_FILE_SIZE, OUTPUT_FILE_PATH
10+
from gitingest.config import MAX_FILE_SIZE, OUTPUT_FILE_NAME
1111
from gitingest.repository_ingest import ingest_async
1212

1313

@@ -92,15 +92,15 @@ async def _async_main(
9292
include_patterns = set(include_pattern)
9393

9494
if not output:
95-
output = OUTPUT_FILE_PATH
95+
output = OUTPUT_FILE_NAME
9696
summary, _, _ = await ingest_async(source, max_size, include_patterns, exclude_patterns, branch, output=output)
9797

9898
click.echo(f"Analysis complete! Output written to: {output}")
9999
click.echo("\nSummary:")
100100
click.echo(summary)
101101

102-
except Exception as e:
103-
click.echo(f"Error: {e}", err=True)
102+
except Exception as exc:
103+
click.echo(f"Error: {exc}", err=True)
104104
raise click.Abort()
105105

106106

src/gitingest/repository_clone.py renamed to src/gitingest/cloning.py

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
from pathlib import Path
77
from typing import List, Optional, Tuple
88

9-
from gitingest.utils import async_timeout
9+
from gitingest.utils.timeout_wrapper import async_timeout
1010

1111
TIMEOUT: int = 60
1212

@@ -38,6 +38,7 @@ class CloneConfig:
3838
commit: Optional[str] = None
3939
branch: Optional[str] = None
4040
subpath: str = "/"
41+
blob: bool = False
4142

4243

4344
@async_timeout(TIMEOUT)
@@ -72,14 +73,15 @@ async def clone_repo(config: CloneConfig) -> None:
7273
parent_dir = Path(local_path).parent
7374
try:
7475
os.makedirs(parent_dir, exist_ok=True)
75-
except OSError as e:
76-
raise OSError(f"Failed to create parent directory {parent_dir}: {e}") from e
76+
except OSError as exc:
77+
raise OSError(f"Failed to create parent directory {parent_dir}: {exc}") from exc
7778

7879
# Check if the repository exists
7980
if not await _check_repo_exists(url):
8081
raise ValueError("Repository not found, make sure it is public")
8182

82-
clone_cmd = ["git", "clone", "--recurse-submodules", "--single-branch"]
83+
clone_cmd = ["git", "clone", "--single-branch"]
84+
# TODO re-enable --recurse-submodules
8385

8486
if partial_clone:
8587
clone_cmd += ["--filter=blob:none", "--sparse"]
@@ -98,7 +100,10 @@ async def clone_repo(config: CloneConfig) -> None:
98100
checkout_cmd = ["git", "-C", local_path]
99101

100102
if partial_clone:
101-
checkout_cmd += ["sparse-checkout", "set", config.subpath.lstrip("/")]
103+
if config.blob:
104+
checkout_cmd += ["sparse-checkout", "set", config.subpath.lstrip("/")[:-1]]
105+
else:
106+
checkout_cmd += ["sparse-checkout", "set", config.subpath.lstrip("/")]
102107

103108
if commit:
104109
checkout_cmd += ["checkout", commit]
@@ -149,7 +154,6 @@ async def _check_repo_exists(url: str) -> bool:
149154
raise RuntimeError(f"Unexpected status code: {status_code}")
150155

151156

152-
@async_timeout(TIMEOUT)
153157
async def fetch_remote_branch_list(url: str) -> List[str]:
154158
"""
155159
Fetch the list of branches from a remote Git repository.

src/gitingest/config.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,6 @@
88
MAX_FILES = 10_000 # Maximum number of files to process
99
MAX_TOTAL_SIZE_BYTES = 500 * 1024 * 1024 # 500 MB
1010

11-
OUTPUT_FILE_PATH = "digest.txt"
11+
OUTPUT_FILE_NAME = "digest.txt"
1212

1313
TMP_BASE_PATH = Path(tempfile.gettempdir()) / "gitingest"

src/gitingest/filesystem_schema.py

Lines changed: 143 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,143 @@
1+
""" Define the schema for the filesystem representation. """
2+
3+
from __future__ import annotations
4+
5+
import os
6+
from dataclasses import dataclass, field
7+
from enum import Enum, auto
8+
from pathlib import Path
9+
10+
from gitingest.exceptions import InvalidNotebookError
11+
from gitingest.utils.ingestion_utils import _get_encoding_list
12+
from gitingest.utils.notebook_utils import process_notebook
13+
from gitingest.utils.textfile_checker_utils import is_textfile
14+
15+
SEPARATOR = "=" * 48 + "\n"
16+
17+
18+
class FileSystemNodeType(Enum):
    """Kind of entry a file system node stands for: a directory or a file."""

    # Values are assigned automatically, in declaration order.
    DIRECTORY = auto()
    FILE = auto()
23+
24+
25+
@dataclass
class FileSystemStats:
    """Mutable counters that accumulate while a file system tree is traversed."""

    # Paths recorded so far; every instance gets its own empty set.
    visited: set[Path] = field(default_factory=set)
    # Running count of files (presumably files processed -- confirm against the traversal code).
    total_files: int = 0
    # Running size total (presumably in bytes -- confirm against the traversal code).
    total_size: int = 0
32+
33+
34+
@dataclass
class FileSystemNode:  # pylint: disable=too-many-instance-attributes
    """
    Class representing a node in the file system (either a file or directory).

    This class has more than the recommended number of attributes because it needs to
    track various properties of files and directories for comprehensive analysis.
    """

    name: str
    type: FileSystemNodeType  # FileSystemNodeType.DIRECTORY or FileSystemNodeType.FILE
    path_str: str  # Path text printed in the "File: ..." header of ``content_string``
    path: Path  # Location that ``content`` opens and reads
    size: int = 0
    file_count: int = 0
    dir_count: int = 0
    depth: int = 0
    children: list[FileSystemNode] = field(default_factory=list)  # Fresh list per instance

    def sort_children(self) -> None:
        """
        Reorder ``self.children`` according to a fixed grouping.

        Order of the groups:
          1. README.md files (name compared case-insensitively)
          2. Regular files (not starting with dot)
          3. Hidden files (starting with dot)
          4. Regular directories (not starting with dot)
          5. Hidden directories (starting with dot)

        Groups 2-5 are sorted by ``name`` using plain (case-sensitive) string comparison.
        README files keep the relative order they already had. ``self.children`` is
        rebound to a new list; the previous list object is left untouched.
        """

        def _sort_key(child: FileSystemNode) -> tuple[int, str]:
            hidden = child.name.startswith(".")
            if child.type == FileSystemNodeType.FILE:
                if child.name.lower() == "readme.md":
                    # Constant secondary key: the stable sort preserves the existing order.
                    return (0, "")
                return (2 if hidden else 1, child.name)
            return (4 if hidden else 3, child.name)

        # One stable sort replaces the separate per-group filtering and sorting passes.
        self.children = sorted(self.children, key=_sort_key)

    @property
    def content_string(self) -> str:
        """
        Return the node's content wrapped in a separator-delimited header.

        The result is a separator line, a ``File: <path>`` line built from ``path_str``
        with ``os.sep`` replaced by forward slashes, another separator line, and then
        the value of ``content`` followed by a blank line.

        Returns
        -------
        str
            The formatted header and content of the node.
        """
        # Use forward slashes in output paths
        display_path = str(self.path_str).replace(os.sep, "/")
        return f"{SEPARATOR}File: {display_path}\n{SEPARATOR}{self.content}\n\n"

    @property
    def content(self) -> str:  # pylint: disable=too-many-return-statements
        """
        Read the content of the node's file.

        A file node that ``is_textfile`` rejects is reported as ``"[Non-text file]"``.
        A path with the ``.ipynb`` suffix is handed to ``process_notebook``. Any other
        path is opened with each encoding from ``_get_encoding_list()`` in turn, and the
        first one that decodes without a ``UnicodeDecodeError`` wins.

        Failures are reported in the returned string rather than raised.

        Returns
        -------
        str
            The content of the file, or a message describing why it could not be read.
        """
        if self.type == FileSystemNodeType.FILE and not is_textfile(self.path):
            return "[Non-text file]"

        try:
            if self.path.suffix == ".ipynb":
                try:
                    return process_notebook(self.path)
                except Exception as exc:
                    return f"Error processing notebook: {exc}"

            for encoding in _get_encoding_list():
                try:
                    with self.path.open(encoding=encoding) as f:
                        return f.read()
                except UnicodeDecodeError:
                    # Wrong guess; try the next candidate encoding.
                    continue

            return "Error: Unable to decode file with available encodings"

        # OSError raised while opening or reading is reported here, so no separate
        # handler is needed inside the loop.
        except (OSError, InvalidNotebookError) as exc:
            return f"Error reading file: {exc}"

0 commit comments

Comments
 (0)