mirror of
https://github.com/frej/fast-export.git
synced 2026-02-08 14:16:46 +01:00
Converts large Mercurial repositories to Git/LFS significantly faster by integrating the LFS conversion into the history export process. Currently, converting large repositories requires two sequential, long-running steps: 1. Full history conversion (`hg` to `git`). 2. Full history rewrite/import (`git lfs import`). For huge monorepos (100GiB+, 1M+ files), this sequence can take hours or days. This commit introduces a new plugin that allows the repository to be converted *incrementally* (JIT: Just-In-Time). The plugin identifies large files during the initial `hg` to `git` conversion and immediately writes LFS pointers, eliminating the need for the second, time-consuming history rewrite step.
50 lines
1.5 KiB
Python
50 lines
1.5 KiB
Python
import pathlib
|
|
import hashlib
|
|
import pathspec
|
|
|
|
|
|
def build_filter(args):
    """Create the LFS filter from a pattern file.

    args: path of a text file whose lines are handed to pathspec as
        gitwildmatch patterns; the file is opened with the platform's
        default text encoding.

    Returns a Filter wrapping the compiled PathSpec.
    """
    with open(args) as pattern_file:
        pattern_factory = pathspec.patterns.GitWildMatchPattern
        compiled_spec = pathspec.PathSpec.from_lines(pattern_factory, pattern_file)
    return Filter(compiled_spec)
|
|
|
|
|
|
class Filter:
    """File-data filter that replaces matching file contents with LFS pointers.

    For every file whose name matches ``lfs_spec`` the raw content is stored
    under ``.git/lfs/objects`` (relative to the current working directory)
    and the content handed back to the exporter is replaced by a Git LFS
    pointer.
    """

    def __init__(self, lfs_spec):
        """Remember the matcher that selects LFS-tracked files.

        lfs_spec: object exposing ``match_file(path) -> bool``, e.g. a
            ``pathspec.PathSpec``.
        """
        self.lfs_spec = lfs_spec

    def file_data_filter(self, file_data):
        """Move the content of a matching file into LFS and emit a pointer.

        file_data: {
            'filename': <bytes or str>,
            'file_ctx': <mercurial.filectx or None>,
            'data': <bytes or None>,
            'is_largefile': <bool>
        }

        May be called for deletions (data=None, file_ctx=None).

        Returns None. When the file matches, ``file_data['data']`` is
        replaced in place by the LFS pointer; otherwise ``file_data`` is
        left untouched.

        Raises UnicodeDecodeError if a bytes filename is not valid UTF-8.
        """
        filename = file_data.get('filename')
        data = file_data.get('data')

        # Deletions carry no content, so there is nothing to move into LFS.
        if data is None:
            return

        # Matching is done on text. Byte filenames are decoded; filenames
        # that already are str are used as-is instead of failing on .decode.
        if isinstance(filename, bytes):
            filename = filename.decode("utf-8")
        if not self.lfs_spec.match_file(filename):
            return

        # LFS objects are addressed by the SHA-256 of their content and
        # sharded into two directory levels taken from the hash prefix.
        sha256hash = hashlib.sha256(data).hexdigest()
        lfs_path = pathlib.Path(f".git/lfs/objects/{sha256hash[0:2]}/{sha256hash[2:4]}")
        lfs_file_path = lfs_path / sha256hash

        # Only store the blob when it is not in the LFS object store yet.
        if not lfs_file_path.is_file():
            lfs_path.mkdir(parents=True, exist_ok=True)
            # Write to a temporary sibling and rename it into place, so an
            # interrupted run never leaves a truncated object behind that a
            # later run would mistake for a complete one.
            tmp_file_path = lfs_path / f"{sha256hash}.tmp"
            tmp_file_path.write_bytes(data)
            tmp_file_path.replace(lfs_file_path)

        # Hand the LFS pointer back to the exporter instead of the content.
        file_data['data'] = (
            "version https://git-lfs.github.com/spec/v1\n"
            f"oid sha256:{sha256hash}\n"
            f"size {len(data)}\n"
        ).encode("utf-8")
|