This commit is contained in:
HamsterMimi
2023-05-04 13:09:03 +08:00
commit 189df25fd3
207 changed files with 242887 additions and 0 deletions

View File

@@ -0,0 +1,38 @@
# Copyright (C) 2010, 2011 Sebastian Thiel (byronimo@gmail.com) and contributors
#
# This module is part of GitDB and is released under
# the New BSD License: http://www.opensource.org/licenses/bsd-license.php
"""Initialize the object database module"""
import sys
import os
#{ Initialization
def _init_externals():
    """Put the bundled ``smmap`` checkout on ``sys.path`` and import it once.

    Unless the ``PYOXIDIZER`` environment variable is present, the directory
    ``ext/smmap`` located next to this file is appended to ``sys.path`` if it
    exists. Afterwards ``smmap`` is imported and the local name is dropped
    again, so the function leaves no name behind.
    """
    running_in_pyoxidizer = 'PYOXIDIZER' in os.environ
    if not running_in_pyoxidizer:
        package_dir = os.path.dirname(__file__)
        bundled_smmap = os.path.join(package_dir, 'ext', 'smmap')
        if os.path.exists(bundled_smmap):
            sys.path.append(bundled_smmap)
        # END handle bundled checkout
    # END handle path setup

    import smmap
    del smmap
    # END handle imports
#} END initialization
# make external projects importable before the default imports further down run
_init_externals()

# package metadata
__author__ = "Sebastian Thiel"
__contact__ = "byronimo@gmail.com"
__homepage__ = "https://github.com/gitpython-developers/gitdb"

# numeric version tuple and the dotted string derived from it
version_info = (4, 0, 10)
__version__ = '.'.join(map(str, version_info))
# default imports
from gitdb.base import *
from gitdb.db import *
from gitdb.stream import *

View File

@@ -0,0 +1,315 @@
# Copyright (C) 2010, 2011 Sebastian Thiel (byronimo@gmail.com) and contributors
#
# This module is part of GitDB and is released under
# the New BSD License: http://www.opensource.org/licenses/bsd-license.php
"""Module with basic data structures - they are designed to be lightweight and fast"""
from gitdb.util import bin_to_hex
from gitdb.fun import (
type_id_to_type_map,
type_to_type_id_map
)
__all__ = ('OInfo', 'OPackInfo', 'ODeltaPackInfo',
'OStream', 'OPackStream', 'ODeltaPackStream',
'IStream', 'InvalidOInfo', 'InvalidOStream')
#{ ODB Bases
class OInfo(tuple):
    """Lightweight, immutable description of an object stored in an ODB.

    The instance is a 3-tuple ``(binsha, type, size)``: the binary sha of the
    object, its type string and its uncompressed size in bytes.

    Every field can be read by index or by attribute::

        assert dbi[0] == dbi.binsha
        assert dbi[1] == dbi.type
        assert dbi[2] == dbi.size

    No per-instance ``__dict__`` is created, keeping the type as small as possible."""
    __slots__ = ()

    def __new__(cls, sha, type, size):
        fields = (sha, type, size)
        return tuple.__new__(cls, fields)

    def __init__(self, *args):
        # the tuple contents were fixed by __new__; the arguments are ignored here
        tuple.__init__(self)

    #{ Interface
    @property
    def binsha(self):
        """:return: our sha as binary, 20 bytes"""
        return self[0]

    @property
    def hexsha(self):
        """:return: our sha, hex encoded, 40 bytes"""
        raw_sha = self[0]
        return bin_to_hex(raw_sha)

    @property
    def type(self):
        """:return: the type stored at index 1"""
        return self[1]

    @property
    def type_id(self):
        """:return: entry of ``type_to_type_id_map`` for our type"""
        type_name = self[1]
        return type_to_type_id_map[type_name]

    @property
    def size(self):
        """:return: the size stored at index 2"""
        return self[2]
    #} END interface
class OPackInfo(tuple):
    """Like OInfo, but for objects inside a pack: there is no sha, and the type
    is stored as its numerical id.

    The instance is a 3-tuple ``(pack_offset, type_id, size)``. ``pack_offset``
    is the absolute offset into the packfile at which all object information
    is located.

    NOTE(review): earlier documentation mentioned a ``data_offset`` property;
    this class does not define one."""
    __slots__ = ()

    def __new__(cls, packoffset, type, size):
        fields = (packoffset, type, size)
        return tuple.__new__(cls, fields)

    def __init__(self, *args):
        # the tuple contents were fixed by __new__; the arguments are ignored here
        tuple.__init__(self)

    #{ Interface
    @property
    def pack_offset(self):
        """:return: the pack offset stored at index 0"""
        return self[0]

    @property
    def type(self):
        """:return: entry of ``type_id_to_type_map`` for our numerical type id"""
        numeric_id = self[1]
        return type_id_to_type_map[numeric_id]

    @property
    def type_id(self):
        """:return: the numerical type id stored at index 1"""
        return self[1]

    @property
    def size(self):
        """:return: the size stored at index 2"""
        return self[2]
    #} END interface
class ODeltaPackInfo(OPackInfo):
    """Pack object information extended by a fourth field, ``delta_info``.

    ``delta_info`` is either the 20 byte sha which points to some object in the
    database, or the negative offset from the pack_offset, so that
    ``pack_offset - delta_info`` yields the pack offset of the base object."""
    __slots__ = ()

    def __new__(cls, packoffset, type, size, delta_info):
        fields = (packoffset, type, size, delta_info)
        return tuple.__new__(cls, fields)

    #{ Interface
    @property
    def delta_info(self):
        """:return: the delta information stored at index 3"""
        return self[3]
    #} END interface
class OStream(OInfo):
    """Object information plus a stream, as retrieved from the database.

    The instance is a 4-tuple ``(binsha, type, size, stream)``.
    Generally, ODB streams are read-only as objects are immutable."""
    __slots__ = ()

    def __new__(cls, sha, type, size, stream, *args, **kwargs):
        """Build the 4-tuple; extra arguments are accepted and ignored, which
        helps with the initialization of subclasses"""
        fields = (sha, type, size, stream)
        return tuple.__new__(cls, fields)

    def __init__(self, *args, **kwargs):
        # the tuple contents were fixed by __new__; the arguments are ignored here
        tuple.__init__(self)

    #{ Stream Reader Interface
    def read(self, size=-1):
        """Forward the read call, including ``size``, to the wrapped stream"""
        wrapped = self[3]
        return wrapped.read(size)

    @property
    def stream(self):
        """:return: the stream stored at index 3"""
        return self[3]
    #} END stream reader interface
class ODeltaStream(OStream):
    """An OStream which reports the size of its stream instead of the size
    stored in the tuple, delaying reads"""

    def __new__(cls, sha, type, size, stream, *args, **kwargs):
        """Build the 4-tuple; extra arguments are accepted and ignored, which
        helps with the initialization of subclasses"""
        fields = (sha, type, size, stream)
        return tuple.__new__(cls, fields)

    #{ Stream Reader Interface
    @property
    def size(self):
        """:return: the ``size`` attribute of the wrapped stream"""
        wrapped = self[3]
        return wrapped.size
    #} END stream reader interface
class OPackStream(OPackInfo):
    """Pack object information plus a stream outputting an undeltified base
    object. The instance is a 4-tuple ``(pack_offset, type_id, size, stream)``."""
    __slots__ = ()

    def __new__(cls, packoffset, type, size, stream, *args):
        """Build the 4-tuple; extra positional arguments are accepted and
        ignored, which helps with the initialization of subclasses"""
        fields = (packoffset, type, size, stream)
        return tuple.__new__(cls, fields)

    #{ Stream Reader Interface
    def read(self, size=-1):
        """Forward the read call, including ``size``, to the wrapped stream"""
        wrapped = self[3]
        return wrapped.read(size)

    @property
    def stream(self):
        """:return: the stream stored at index 3"""
        return self[3]
    #} END stream reader interface
class ODeltaPackStream(ODeltaPackInfo):
    """Delta pack information plus a stream outputting the uncompressed offset
    delta information. The instance is a 5-tuple
    ``(pack_offset, type_id, size, delta_info, stream)``."""
    __slots__ = ()

    def __new__(cls, packoffset, type, size, delta_info, stream):
        fields = (packoffset, type, size, delta_info, stream)
        return tuple.__new__(cls, fields)

    #{ Stream Reader Interface
    def read(self, size=-1):
        """Forward the read call, including ``size``, to the wrapped stream"""
        wrapped = self[4]
        return wrapped.read(size)

    @property
    def stream(self):
        """:return: the stream stored at index 4"""
        return self[4]
    #} END stream reader interface
class IStream(list):
    """Mutable input content stream to be fed into the ODB.

    The instance is a 5-item list ``[binsha, type, size, stream, error]``. It is
    mutable to allow the ODB to record information about the operations outcome
    right in this instance.

    It provides interfaces for the OStream and a StreamReader to allow the
    instance to blend in without prior conversion.

    The only method your content stream must support is 'read'"""
    __slots__ = ()

    def __new__(cls, type, size, stream, sha=None):
        initial = (sha, type, size, stream, None)
        return list.__new__(cls, initial)

    def __init__(self, type, size, stream, sha=None):
        # the last slot holds the error, which starts out unset
        initial = (sha, type, size, stream, None)
        list.__init__(self, initial)

    #{ Interface
    @property
    def hexsha(self):
        """:return: our sha, hex encoded, 40 bytes"""
        raw_sha = self[0]
        return bin_to_hex(raw_sha)

    def _error(self):
        """:return: the error that occurred when processing the stream, or None"""
        return self[4]

    def _set_error(self, exc):
        """Set this input stream to the given exc, may be None to reset the error"""
        self[4] = exc

    error = property(fget=_error, fset=_set_error)
    #} END interface

    #{ Stream Reader Interface
    def read(self, size=-1):
        """Simple stream reader interface which hands the read call, including
        ``size``, to our internal stream"""
        wrapped = self[3]
        return wrapped.read(size)
    #} END stream reader interface

    #{ interface
    # Each property below reads and writes one fixed list slot.
    def _set_binsha(self, binsha):
        self[0] = binsha

    def _binsha(self):
        return self[0]

    binsha = property(fget=_binsha, fset=_set_binsha)

    def _type(self):
        return self[1]

    def _set_type(self, type):
        self[1] = type

    type = property(fget=_type, fset=_set_type)

    def _size(self):
        return self[2]

    def _set_size(self, size):
        self[2] = size

    size = property(fget=_size, fset=_set_size)

    def _stream(self):
        return self[3]

    def _set_stream(self, stream):
        self[3] = stream

    stream = property(fget=_stream, fset=_set_stream)
    #} END odb info interface
class InvalidOInfo(tuple):
    """Carries information about a sha identifying an object which is invalid in
    the queried database. The exception attribute provides more information about
    the cause of the issue.

    The instance is a 2-tuple ``(binsha, error)``."""
    __slots__ = tuple()

    def __new__(cls, sha, exc):
        return tuple.__new__(cls, (sha, exc))

    def __init__(self, sha, exc):
        # The tuple contents are fixed by __new__. tuple does not define its own
        # __init__, so the call resolves to object.__init__, which rejects any
        # extra argument when a subclass overrides __init__ - pass none.
        tuple.__init__(self)

    @property
    def binsha(self):
        """:return: the sha stored at index 0"""
        return self[0]

    @property
    def hexsha(self):
        """:return: our sha, hex encoded"""
        return bin_to_hex(self[0])

    @property
    def error(self):
        """:return: exception instance explaining the failure"""
        return self[1]
class InvalidOStream(InvalidOInfo):
    """Marker type for an invalid ODB stream; it adds nothing to InvalidOInfo
    beyond its own name"""
    __slots__ = ()
#} END ODB Bases

View File

@@ -0,0 +1,4 @@
# a single space character, as bytes
BYTE_SPACE = b' '
# a single zero byte
NULL_BYTE = b'\x00'
# all-zero sha in its 40 character hex form ...
NULL_HEX_SHA = 40 * "0"
# ... and in its 20 byte binary form
NULL_BIN_SHA = 20 * NULL_BYTE

View File

@@ -0,0 +1,11 @@
# Copyright (C) 2010, 2011 Sebastian Thiel (byronimo@gmail.com) and contributors
#
# This module is part of GitDB and is released under
# the New BSD License: http://www.opensource.org/licenses/bsd-license.php
from gitdb.db.base import *
from gitdb.db.loose import *
from gitdb.db.mem import *
from gitdb.db.pack import *
from gitdb.db.git import *
from gitdb.db.ref import *

View File

@@ -0,0 +1,278 @@
# Copyright (C) 2010, 2011 Sebastian Thiel (byronimo@gmail.com) and contributors
#
# This module is part of GitDB and is released under
# the New BSD License: http://www.opensource.org/licenses/bsd-license.php
"""Contains implementations of database retrieveing objects"""
from gitdb.util import (
join,
LazyMixin,
hex_to_bin
)
from gitdb.utils.encoding import force_text
from gitdb.exc import (
BadObject,
AmbiguousObjectName
)
from itertools import chain
from functools import reduce
__all__ = ('ObjectDBR', 'ObjectDBW', 'FileDBBase', 'CompoundDB', 'CachingDB')
class ObjectDBR:
    """Defines an interface for object database lookup.

    Objects are identified by their 20 byte binary sha."""

    def __contains__(self, sha):
        """Support ``sha in db`` by delegating to :meth:`has_object`

        :return: whatever ``has_object(sha)`` returns"""
        return self.has_object(sha)

    #{ Query Interface
    def has_object(self, sha):
        """
        Whether the object identified by the given 20 bytes
            binary sha is contained in the database

        :return: True if the object identified by the given 20 bytes
            binary sha is contained in the database
        :raise NotImplementedError: always, in this base class"""
        raise NotImplementedError("To be implemented in subclass")

    def info(self, sha):
        """ :return: OInfo instance
        :param sha: bytes binary sha
        :raise BadObject:
        :raise NotImplementedError: always, in this base class"""
        raise NotImplementedError("To be implemented in subclass")

    def stream(self, sha):
        """:return: OStream instance
        :param sha: 20 bytes binary sha
        :raise BadObject:
        :raise NotImplementedError: always, in this base class"""
        raise NotImplementedError("To be implemented in subclass")

    def size(self):
        """:return: amount of objects in this database
        :raise NotImplementedError: always, in this base class"""
        raise NotImplementedError()

    def sha_iter(self):
        """Return iterator yielding 20 byte shas for all objects in this data base

        :raise NotImplementedError: always, in this base class"""
        raise NotImplementedError()
    #} END query interface
class ObjectDBW:
    """Interface for databases into which new objects can be written"""

    def __init__(self, *args, **kwargs):
        # no override installed initially; all arguments are ignored
        self._ostream = None

    #{ Edit Interface
    def set_ostream(self, stream):
        """
        Install the stream to which all data should be sent when storing new objects

        :param stream: if not None, the stream to use, if None the default stream
            will be used.
        :return: previously installed stream, or None if there was no override
        :raise TypeError: if the stream doesn't have the supported functionality
            (this base implementation performs no such check itself)"""
        previous, self._ostream = self._ostream, stream
        return previous

    def ostream(self):
        """
        Return the output stream

        :return: overridden output stream this instance will write to, or None
            if it will write to the default stream"""
        return self._ostream

    def store(self, istream):
        """
        Create a new object in the database

        :return: the input istream object with its sha set to its corresponding value
        :param istream: IStream compatible instance. If its sha is already set
            to a value, the object will just be stored in the our database format,
            in which case the input stream is expected to be in object format ( header + contents ).
        :raise IOError: if data could not be written
        :raise NotImplementedError: always, in this base class"""
        raise NotImplementedError("To be implemented in subclass")
    #} END edit interface
class FileDBBase:
    """Base for databases living below a root path on disk. It stores the root
    and resolves paths relative to it."""

    def __init__(self, root_path):
        """Remember the root path at which this instance looks for its files.
        All subsequent operations will be relative to this path

        :raise InvalidDBRoot:
        **Note:** The base will not perform any accessibility checking as the base
            might not yet be accessible, but become accessible before the first
            access."""
        super().__init__()
        self._root_path = root_path

    #{ Interface
    def root_path(self):
        """:return: path at which this db operates"""
        return self._root_path

    def db_path(self, rela_path):
        """
        :return: the given relative path relative to our database root, allowing
            to potentially access datafiles"""
        relative = force_text(rela_path)
        return join(self._root_path, relative)
    #} END interface
class CachingDB:
    """Interface of a database which keeps caches to speed up access"""

    #{ Interface
    def update_cache(self, force=False):
        """
        Hook to call when the underlying data changed, to trigger an update
        of the internal caching structures. This base implementation has no
        body beyond this documentation.

        :param force: if True, the update must be performed. Otherwise the implementation
            may decide not to perform an update if it thinks nothing has changed.
        :return: True if an update was performed as something changed indeed"""

    # END interface
def _databases_recursive(database, output):
    """Append the non-compound databases reachable from ``database`` to ``output``.

    A database that is not a CompoundDB is appended as is. For a CompoundDB,
    its direct non-compound children are appended first, in order; afterwards
    each compound child is descended into, in order.

    :param database: database to flatten
    :param output: list which is extended in place"""
    if not isinstance(database, CompoundDB):
        output.append(database)
        return
    # END handle plain database

    compound_children = []
    for child in database.databases():
        if isinstance(child, CompoundDB):
            # handled only after all plain children have been added
            compound_children.append(child)
        else:
            output.append(child)
        # END handle child type
    # END for each child

    for compound in compound_children:
        _databases_recursive(compound, output)
    # END for each compound child
class CompoundDB(ObjectDBR, LazyMixin, CachingDB):
    """A database which answers queries by delegating them to sub-databases.

    The sub-databases live in the lazy-loaded ``_dbs`` attribute; ``_db_cache``
    remembers which sub-database answered for a given sha.
    Define _set_cache_ to update ``_dbs`` with your databases"""

    def _set_cache_(self, attr):
        if attr == '_dbs':
            self._dbs = []
        elif attr == '_db_cache':
            self._db_cache = {}
        else:
            super()._set_cache_(attr)
        # END handle attribute

    def _db_query(self, sha):
        """:return: database containing the given 20 byte sha
        :raise BadObject:"""
        # first level: the database which answered for this sha before
        try:
            return self._db_cache[sha]
        except KeyError:
            pass
        # END first level cache

        # second level: ask each database in order and remember the first hit
        for candidate in self._dbs:
            if not candidate.has_object(sha):
                continue
            self._db_cache[sha] = candidate
            return candidate
        # END for each database
        raise BadObject(sha)

    #{ ObjectDBR interface
    def has_object(self, sha):
        try:
            self._db_query(sha)
        except BadObject:
            return False
        # END handle exceptions
        return True

    def info(self, sha):
        owner = self._db_query(sha)
        return owner.info(sha)

    def stream(self, sha):
        owner = self._db_query(sha)
        return owner.stream(sha)

    def size(self):
        """:return: total size of all contained databases"""
        total = 0
        for db in self._dbs:
            total = total + db.size()
        # END for each database
        return total

    def sha_iter(self):
        iterators = [db.sha_iter() for db in self._dbs]
        return chain(*iterators)
    #} END object DBR Interface

    #{ Interface
    def databases(self):
        """:return: tuple of database instances we use for lookups"""
        return tuple(self._dbs)

    def update_cache(self, force=False):
        # something might have changed, forget which database owns which sha
        self._db_cache.clear()
        changed = False
        for db in self._dbs:
            if not isinstance(db, CachingDB):
                continue
            # END skip non-caching databases
            changed |= db.update_cache(force)
        # END for each database to update
        return changed

    def partial_to_complete_sha_hex(self, partial_hexsha):
        """
        :return: 20 byte binary sha1 from the given less-than-40 byte hexsha (bytes or str)
        :param partial_hexsha: hexsha with less than 40 byte
        :raise AmbiguousObjectName: if different full shas match
        :raise BadObject: if no database yields a match"""
        flat_dbs = []
        _databases_recursive(self, flat_dbs)
        partial_hexsha = force_text(partial_hexsha)
        hex_length = len(partial_hexsha)
        # an odd amount of hex characters is padded with a zero for the conversion
        if hex_length % 2 != 0:
            convertible = partial_hexsha + "0"
        else:
            convertible = partial_hexsha
        # END assure successful binary conversion
        partial_binsha = hex_to_bin(convertible)

        match = None
        for db in flat_dbs:
            try:
                if hasattr(db, 'partial_to_complete_sha_hex'):
                    found = db.partial_to_complete_sha_hex(partial_hexsha)
                else:
                    found = db.partial_to_complete_sha(partial_binsha, hex_length)
                # END handle database type
            except BadObject:
                continue
            # END ignore bad objects
            if not found:
                continue
            # END skip empty results
            if match and match != found:
                raise AmbiguousObjectName(partial_hexsha)
            match = found
        # END for each db
        if not match:
            raise BadObject(partial_binsha)
        return match
    #} END interface

View File

@@ -0,0 +1,85 @@
# Copyright (C) 2010, 2011 Sebastian Thiel (byronimo@gmail.com) and contributors
#
# This module is part of GitDB and is released under
# the New BSD License: http://www.opensource.org/licenses/bsd-license.php
from gitdb.db.base import (
CompoundDB,
ObjectDBW,
FileDBBase
)
from gitdb.db.loose import LooseObjectDB
from gitdb.db.pack import PackedDB
from gitdb.db.ref import ReferenceDB
from gitdb.exc import InvalidDBRoot
import os
__all__ = ('GitDB', )
class GitDB(FileDBBase, ObjectDBW, CompoundDB):
    """A git-style object database, which contains all objects in the 'objects'
    subdirectory

    ``IMPORTANT``: The usage of this implementation is highly discouraged as it fails to release file-handles.
    This can be a problem with long-running processes and/or big repositories.
    """
    # Configuration - database types used per sub-directory
    PackDBCls = PackedDB
    LooseDBCls = LooseObjectDB
    ReferenceDBCls = ReferenceDB

    # Directories, relative to the root path
    packs_dir = 'pack'
    loose_dir = ''
    alternates_dir = os.path.join('info', 'alternates')

    def __init__(self, root_path):
        """Initialize ourselves on a git objects directory"""
        super().__init__(root_path)

    def _set_cache_(self, attr):
        if attr not in ('_dbs', '_loose_db'):
            super()._set_cache_(attr)
            return
        # END handle foreign attrs

        # both attributes are always computed together
        self._dbs = []
        loose_db = None
        layout = (
            (self.packs_dir, self.PackDBCls),
            (self.loose_dir, self.LooseDBCls),
            (self.alternates_dir, self.ReferenceDBCls),
        )
        for subpath, dbcls in layout:
            path = self.db_path(subpath)
            if not os.path.exists(path):
                continue
            # END skip missing paths
            self._dbs.append(dbcls(path))
            if dbcls is self.LooseDBCls:
                loose_db = self._dbs[-1]
            # END remember loose db
        # END for each db type

        # should have at least one subdb
        if not self._dbs:
            raise InvalidDBRoot(self.root_path())
        # END handle error

        # the loose database is the one all store calls are forwarded to
        assert loose_db is not None and hasattr(loose_db, 'store'), "First database needs store functionality"

        # finally set the value
        self._loose_db = loose_db

    #{ ObjectDBW interface
    # All write operations are forwarded to the loose object database.
    def store(self, istream):
        target = self._loose_db
        return target.store(istream)

    def ostream(self):
        target = self._loose_db
        return target.ostream()

    def set_ostream(self, ostream):
        target = self._loose_db
        return target.set_ostream(ostream)
    #} END objectdbw interface

View File

@@ -0,0 +1,258 @@
# Copyright (C) 2010, 2011 Sebastian Thiel (byronimo@gmail.com) and contributors
#
# This module is part of GitDB and is released under
# the New BSD License: http://www.opensource.org/licenses/bsd-license.php
from gitdb.db.base import (
FileDBBase,
ObjectDBR,
ObjectDBW
)
from gitdb.exc import (
BadObject,
AmbiguousObjectName
)
from gitdb.stream import (
DecompressMemMapReader,
FDCompressedSha1Writer,
FDStream,
Sha1Writer
)
from gitdb.base import (
OStream,
OInfo
)
from gitdb.util import (
file_contents_ro_filepath,
ENOENT,
hex_to_bin,
bin_to_hex,
exists,
chmod,
isdir,
isfile,
remove,
mkdir,
rename,
dirname,
basename,
join
)
from gitdb.fun import (
chunk_size,
loose_object_header_info,
write_object,
stream_copy
)
from gitdb.utils.encoding import force_bytes
import tempfile
import os
import sys
__all__ = ('LooseObjectDB', )
class LooseObjectDB(FileDBBase, ObjectDBR, ObjectDBW):
    """A database which operates on loose object files, one file per object,
    located below the root path at ``<first two hex chars>/<remaining hex chars>``"""

    # CONFIGURATION
    # chunks in which data will be copied between streams
    stream_chunk_size = chunk_size

    # On windows we need to keep it writable, otherwise it cannot be removed
    # either
    new_objects_mode = int("444", 8)
    if os.name == 'nt':
        new_objects_mode = int("644", 8)

    def __init__(self, root_path):
        super().__init__(root_path)
        # cache mapping hexshas to the path of their object file
        self._hexsha_to_file = dict()
        # Additional Flags - might be set to 0 after the first failure
        # Depending on the root, this might work for some mounts, for others not, which
        # is why it is per instance
        self._fd_open_flags = getattr(os, 'O_NOATIME', 0)

    #{ Interface
    def object_path(self, hexsha):
        """
        :return: path at which the object with the given hexsha would be stored,
            relative to the database root"""
        return join(hexsha[:2], hexsha[2:])

    def readable_db_object_path(self, hexsha):
        """
        :return: readable object path to the object identified by hexsha
        :raise BadObject: If the object file does not exist"""
        try:
            return self._hexsha_to_file[hexsha]
        except KeyError:
            pass
        # END ignore cache misses

        # try filesystem
        path = self.db_path(self.object_path(hexsha))
        if exists(path):
            self._hexsha_to_file[hexsha] = path
            return path
        # END handle cache
        raise BadObject(hexsha)

    def partial_to_complete_sha_hex(self, partial_hexsha):
        """:return: 20 byte binary sha1 string which matches the given name uniquely
        :param partial_hexsha: hexadecimal partial name (bytes or ascii string)
        :raise AmbiguousObjectName: if more than one object matches
        :raise BadObject: if no object matches"""
        # the prefix does not change while iterating, convert it only once
        prefix = force_bytes(partial_hexsha)
        candidate = None
        for binsha in self.sha_iter():
            if bin_to_hex(binsha).startswith(prefix):
                # it can't ever find the same object twice
                if candidate is not None:
                    raise AmbiguousObjectName(partial_hexsha)
                candidate = binsha
        # END for each object
        if candidate is None:
            raise BadObject(partial_hexsha)
        return candidate
    #} END interface

    def _map_loose_object(self, sha):
        """
        :return: memory map of that file to allow random read access
        :raise BadObject: if object could not be located"""
        db_path = self.db_path(self.object_path(bin_to_hex(sha)))
        try:
            return file_contents_ro_filepath(db_path, flags=self._fd_open_flags)
        except OSError as e:
            if e.errno == ENOENT:
                raise BadObject(sha) from e
            # END handle missing file

            # try again without noatime
            try:
                contents = file_contents_ro_filepath(db_path)
            except OSError as new_e:
                raise BadObject(sha) from new_e
            # END handle second failure

            # It only worked without our flag, so don't try it again. The reset
            # has to happen before returning, otherwise it is never executed.
            self._fd_open_flags = 0
            return contents
        # END exception handling

    def set_ostream(self, stream):
        """:raise TypeError: if the stream does not support the Sha1Writer interface"""
        if stream is not None and not isinstance(stream, Sha1Writer):
            raise TypeError("Output stream musst support the %s interface" % Sha1Writer.__name__)
        return super().set_ostream(stream)

    def info(self, sha):
        """:return: OInfo built from the header of the object's file
        :raise BadObject: if object could not be located"""
        m = self._map_loose_object(sha)
        try:
            typ, size = loose_object_header_info(m)
            return OInfo(sha, typ, size)
        finally:
            if hasattr(m, 'close'):
                m.close()
        # END assure release of system resources

    def stream(self, sha):
        """:return: OStream reading the decompressed contents of the object
        :raise BadObject: if object could not be located"""
        m = self._map_loose_object(sha)
        type, size, stream = DecompressMemMapReader.new(m, close_on_deletion=True)
        return OStream(sha, type, size, stream)

    def has_object(self, sha):
        try:
            self.readable_db_object_path(bin_to_hex(sha))
            return True
        except BadObject:
            return False
        # END check existence

    def store(self, istream):
        """Write the given istream, either to the overridden output stream or
        to a new object file below our root, and set ``istream.binsha``

        note: The sha we produce will be hex by nature

        :return: the given istream"""
        tmp_path = None
        writer = self.ostream()
        if writer is None:
            # open a tmp file to write the data to
            fd, tmp_path = tempfile.mkstemp(prefix='obj', dir=self._root_path)

            if istream.binsha is None:
                writer = FDCompressedSha1Writer(fd)
            else:
                writer = FDStream(fd)
            # END handle direct stream copies
        # END handle custom writer

        try:
            try:
                if istream.binsha is not None:
                    # copy as much as possible, the actual uncompressed item size might
                    # be smaller than the compressed version
                    stream_copy(istream.read, writer.write, sys.maxsize, self.stream_chunk_size)
                else:
                    # write object with header, we have to make a new one
                    write_object(istream.type, istream.size, istream.read, writer.write,
                                 chunk_size=self.stream_chunk_size)
                # END handle direct stream copies
            finally:
                # only streams we opened ourselves are closed here
                if tmp_path:
                    writer.close()
            # END assure target stream is closed
        except BaseException:
            # whatever went wrong, do not leave the temporary file behind
            if tmp_path:
                os.remove(tmp_path)
            raise
        # END assure tmpfile removal on error

        hexsha = None
        if istream.binsha:
            hexsha = istream.hexsha
        else:
            hexsha = writer.sha(as_hex=True)
        # END handle sha

        if tmp_path:
            obj_path = self.db_path(self.object_path(hexsha))
            obj_dir = dirname(obj_path)
            if not isdir(obj_dir):
                mkdir(obj_dir)
            # END handle destination directory
            # rename onto existing doesn't work on NTFS
            if isfile(obj_path):
                remove(tmp_path)
            else:
                rename(tmp_path, obj_path)
            # end rename only if needed

            # make sure its readable for all ! It started out as rw-- tmp file
            # but needs to be rwrr
            chmod(obj_path, self.new_objects_mode)
        # END handle dry_run

        istream.binsha = hex_to_bin(hexsha)
        return istream

    def sha_iter(self):
        """Yield the binary sha of every file below the root which looks like an
        object: a 38 character file name inside a 2 character directory"""
        # find all files which look like an object, extract sha from there
        for root, dirs, files in os.walk(self.root_path()):
            root_base = basename(root)
            if len(root_base) != 2:
                continue

            for f in files:
                if len(f) != 38:
                    continue
                yield hex_to_bin(root_base + f)
            # END for each file
        # END for each walk iteration

    def size(self):
        """:return: amount of objects found by :meth:`sha_iter`"""
        # count while iterating instead of materializing all shas first
        return sum(1 for _ in self.sha_iter())

View File

@@ -0,0 +1,110 @@
# Copyright (C) 2010, 2011 Sebastian Thiel (byronimo@gmail.com) and contributors
#
# This module is part of GitDB and is released under
# the New BSD License: http://www.opensource.org/licenses/bsd-license.php
"""Contains the MemoryDatabase implementation"""
from gitdb.db.loose import LooseObjectDB
from gitdb.db.base import (
ObjectDBR,
ObjectDBW
)
from gitdb.base import (
OStream,
IStream,
)
from gitdb.exc import (
BadObject,
UnsupportedOperation
)
from gitdb.stream import (
ZippedStoreShaWriter,
DecompressMemMapReader,
)
from io import BytesIO
__all__ = ("MemoryDB", )
class MemoryDB(ObjectDBR, ObjectDBW):
    """A database keeping all objects in memory, providing fast IO and object
    retrieval. It should be used to buffer results and obtain SHAs before writing
    it to the actual physical storage, as it allows to query whether object already
    exists in the target storage before introducing actual IO"""

    def __init__(self):
        super().__init__()
        # helper used by store(); it always writes into a stream we install
        self._db = LooseObjectDB("path/doesnt/matter")

        # maps 20 byte shas to their OStream objects
        self._cache = {}

    def set_ostream(self, stream):
        raise UnsupportedOperation("MemoryDB's always stream into memory")

    def store(self, istream):
        mem_writer = ZippedStoreShaWriter()
        self._db.set_ostream(mem_writer)

        istream = self._db.store(istream)
        mem_writer.close()          # close to flush
        mem_writer.seek(0)

        # don't provide a size, the stream is written in object format, hence the
        # header needs decompression
        reader = DecompressMemMapReader(mem_writer.getvalue(), close_on_deletion=False)
        binsha = istream.binsha
        self._cache[binsha] = OStream(binsha, istream.type, istream.size, reader)

        return istream

    def has_object(self, sha):
        return sha in self._cache

    def info(self, sha):
        # we always return streams, which are infos as well
        return self.stream(sha)

    def stream(self, sha):
        try:
            cached = self._cache[sha]
            # rewind stream for the next one to read
            cached.stream.seek(0)
            return cached
        except KeyError as e:
            raise BadObject(sha) from e
        # END exception handling

    def size(self):
        return len(self._cache)

    def sha_iter(self):
        return self._cache.keys()

    #{ Interface
    def stream_copy(self, sha_iter, odb):
        """Copy the streams as identified by sha's yielded by sha_iter into the given odb
        The streams will be copied directly

        **Note:** the object will only be written if it did not exist in the target db

        :return: amount of streams actually copied into odb. If smaller than the amount
            of input shas, one or more objects did already exist in odb"""
        copied = 0
        for sha in sha_iter:
            if odb.has_object(sha):
                continue
            # END check object existence

            source = self.stream(sha)
            # compressed data including header
            raw = BytesIO(source.stream.data())
            odb.store(IStream(source.type, source.size, raw, sha))
            copied += 1
        # END for each sha
        return copied
    #} END interface

View File

@@ -0,0 +1,206 @@
# Copyright (C) 2010, 2011 Sebastian Thiel (byronimo@gmail.com) and contributors
#
# This module is part of GitDB and is released under
# the New BSD License: http://www.opensource.org/licenses/bsd-license.php
"""Module containing a database to deal with packs"""
from gitdb.db.base import (
FileDBBase,
ObjectDBR,
CachingDB
)
from gitdb.util import LazyMixin
from gitdb.exc import (
BadObject,
UnsupportedOperation,
AmbiguousObjectName
)
from gitdb.pack import PackEntity
from functools import reduce
import os
import glob
__all__ = ('PackedDB', )
#{ Utilities
class PackedDB(FileDBBase, ObjectDBR, CachingDB, LazyMixin):
"""A database operating on a set of object packs"""
# sort the priority list every N queries
# Higher values are better, performance tests don't show this has
# any effect, but it should have one
_sort_interval = 500
def __init__(self, root_path):
super().__init__(root_path)
# list of lists with three items:
# * hits - number of times the pack was hit with a request
# * entity - Pack entity instance
# * sha_to_index - PackIndexFile.sha_to_index method for direct cache query
# self._entities = list() # lazy loaded list
self._hit_count = 0 # amount of hits
self._st_mtime = 0 # last modification data of our root path
def _set_cache_(self, attr):
if attr == '_entities':
self._entities = list()
self.update_cache(force=True)
# END handle entities initialization
def _sort_entities(self):
self._entities.sort(key=lambda l: l[0], reverse=True)
def _pack_info(self, sha):
""":return: tuple(entity, index) for an item at the given sha
:param sha: 20 or 40 byte sha
:raise BadObject:
**Note:** This method is not thread-safe, but may be hit in multi-threaded
operation. The worst thing that can happen though is a counter that
was not incremented, or the list being in wrong order. So we safe
the time for locking here, lets see how that goes"""
# presort ?
if self._hit_count % self._sort_interval == 0:
self._sort_entities()
# END update sorting
for item in self._entities:
index = item[2](sha)
if index is not None:
item[0] += 1 # one hit for you
self._hit_count += 1 # general hit count
return (item[1], index)
# END index found in pack
# END for each item
# no hit, see whether we have to update packs
# NOTE: considering packs don't change very often, we safe this call
# and leave it to the super-caller to trigger that
raise BadObject(sha)
#{ Object DB Read
def has_object(self, sha):
try:
self._pack_info(sha)
return True
except BadObject:
return False
# END exception handling
def info(self, sha):
entity, index = self._pack_info(sha)
return entity.info_at_index(index)
def stream(self, sha):
entity, index = self._pack_info(sha)
return entity.stream_at_index(index)
def sha_iter(self):
for entity in self.entities():
index = entity.index()
sha_by_index = index.sha
for index in range(index.size()):
yield sha_by_index(index)
# END for each index
# END for each entity
def size(self):
sizes = [item[1].index().size() for item in self._entities]
return reduce(lambda x, y: x + y, sizes, 0)
#} END object db read
#{ object db write
def store(self, istream):
"""Storing individual objects is not feasible as a pack is designed to
hold multiple objects. Writing or rewriting packs for single objects is
inefficient"""
raise UnsupportedOperation()
#} END object db write
#{ Interface
def update_cache(self, force=False):
"""
Update our cache with the actually existing packs on disk. Add new ones,
and remove deleted ones. We keep the unchanged ones
:param force: If True, the cache will be updated even though the directory
does not appear to have changed according to its modification timestamp.
:return: True if the packs have been updated so there is new information,
False if there was no change to the pack database"""
stat = os.stat(self.root_path())
if not force and stat.st_mtime <= self._st_mtime:
return False
# END abort early on no change
self._st_mtime = stat.st_mtime
# packs are supposed to be prefixed with pack- by git-convention
# get all pack files, figure out what changed
pack_files = set(glob.glob(os.path.join(self.root_path(), "pack-*.pack")))
our_pack_files = {item[1].pack().path() for item in self._entities}
# new packs
for pack_file in (pack_files - our_pack_files):
# init the hit-counter/priority with the size, a good measure for hit-
# probability. Its implemented so that only 12 bytes will be read
entity = PackEntity(pack_file)
self._entities.append([entity.pack().size(), entity, entity.index().sha_to_index])
# END for each new packfile
# removed packs
for pack_file in (our_pack_files - pack_files):
del_index = -1
for i, item in enumerate(self._entities):
if item[1].pack().path() == pack_file:
del_index = i
break
# END found index
# END for each entity
assert del_index != -1
del(self._entities[del_index])
# END for each removed pack
# reinitialize prioritiess
self._sort_entities()
return True
def entities(self):
    """:return: new list with the pack entity of every cache entry, in cache order"""
    found = []
    for entry in self._entities:
        # the entity is the second item of each cache entry
        found.append(entry[1])
    return found
def partial_to_complete_sha(self, partial_binsha, canonical_length):
    """Resolve an abbreviated binary sha by asking the index of every pack entity.

    :return: the full sha stored in the matching pack index
    :param partial_binsha: binary sha with less than 20 bytes
    :param canonical_length: length of the corresponding canonical (hex)
        representation. It is required as a binary sha cannot tell whether the
        original hex sha had an odd or even number of characters
    :raise AmbiguousObjectName: if different packs resolve to different shas
    :raise BadObject: if no pack knows a matching sha"""
    match = None
    for entry in self._entities:
        position = entry[1].index().partial_sha_to_index(partial_binsha, canonical_length)
        if position is None:
            # this pack has no match
            continue
        full_sha = entry[1].index().sha(position)
        if match and match != full_sha:
            raise AmbiguousObjectName(partial_binsha)
        match = full_sha

    if not match:
        raise BadObject(partial_binsha)
    return match
#} END interface

View File

@@ -0,0 +1,82 @@
# Copyright (C) 2010, 2011 Sebastian Thiel (byronimo@gmail.com) and contributors
#
# This module is part of GitDB and is released under
# the New BSD License: http://www.opensource.org/licenses/bsd-license.php
import codecs
from gitdb.db.base import (
CompoundDB,
)
__all__ = ('ReferenceDB', )
class ReferenceDB(CompoundDB):
    """A compound database whose member databases are listed, one path per
    line, in a reference file"""

    # Configuration
    # Specifies the object database to use for the paths found in the alternates
    # file. If None, it defaults to the GitDB
    ObjectDBCls = None

    def __init__(self, ref_file):
        """:param ref_file: path of the file listing the database paths"""
        super().__init__()
        self._ref_file = ref_file

    def _set_cache_(self, attr):
        """Lazily create ``_dbs`` from the reference file; every other
        attribute is delegated to the base class"""
        if attr == '_dbs':
            self._dbs = list()
            self._update_dbs_from_ref_file()
        else:
            super()._set_cache_(attr)
        # END handle attrs

    def _update_dbs_from_ref_file(self):
        """Bring ``self._dbs`` in line with the paths currently listed in the
        reference file: databases whose root path is no longer listed are
        dropped, newly listed paths are instantiated and appended in file order.
        Paths for which no database can be created are skipped."""
        dbcls = self.ObjectDBCls
        if dbcls is None:
            # late import
            from gitdb.db.git import GitDB
            dbcls = GitDB
        # END get db type

        # try to get as many as possible, don't fail if some are unavailable
        ref_paths = list()
        try:
            with codecs.open(self._ref_file, 'r', encoding="utf-8") as f:
                ref_paths = [line.strip() for line in f]
        except OSError:
            # an unreadable reference file is treated like an empty one
            pass
        # END handle alternates

        ref_paths_set = set(ref_paths)
        cur_ref_paths_set = {db.root_path() for db in self._dbs}

        # remove existing: rebuild the list in place instead of deleting by
        # index, as indices taken from a snapshot go stale after a deletion
        stale_paths = cur_ref_paths_set - ref_paths_set
        if stale_paths:
            self._dbs[:] = [db for db in self._dbs if db.root_path() not in stale_paths]
        # END remove stale databases

        # add new, in the order of their first occurrence in the file
        added_paths = [path for path in dict.fromkeys(ref_paths)
                       if path not in cur_ref_paths_set]
        for path in added_paths:
            try:
                db = dbcls(path)
                # force an update to verify path
                if isinstance(db, CompoundDB):
                    db.databases()
                # END verification
                self._dbs.append(db)
            except Exception:
                # deliberate best-effort: ignore invalid paths or issues
                pass
        # END for each path to add

    def update_cache(self, force=False):
        """Re-read the reference file, then update the cache via the base class

        :param force: handed on to the base class implementation
        :return: whatever the base class implementation returns"""
        # re-read alternates and update databases
        self._update_dbs_from_ref_file()
        return super().update_cache(force)

View File

@@ -0,0 +1,46 @@
# Copyright (C) 2010, 2011 Sebastian Thiel (byronimo@gmail.com) and contributors
#
# This module is part of GitDB and is released under
# the New BSD License: http://www.opensource.org/licenses/bsd-license.php
"""Module with common exceptions"""
from gitdb.util import to_hex_sha
class ODBError(Exception):
    """Common base class of the errors raised by the object database"""
class InvalidDBRoot(ODBError):
    """Raised when no object database can be initialized at the given path"""
class BadObject(ODBError):
    """Raised when no object exists for a SHA. The failed sha is expected as
    first constructor argument"""

    def __str__(self):
        # the sha is rendered through to_hex_sha
        failed_sha = self.args[0]
        return "BadObject: %s" % to_hex_sha(failed_sha)
class BadName(ODBError):
    """Raised when a name handed to rev_parse could not be understood. The
    name is expected as first constructor argument"""

    def __str__(self):
        name = self.args[0]
        return "Ref '%s' did not resolve to an object" % name
class ParseError(ODBError):
    """Raised when a file cannot be parsed because its format is invalid"""
class AmbiguousObjectName(ODBError):
    """Raised when a possibly shortened name does not identify exactly one
    object in the database"""
class BadObjectType(ODBError):
    """Raised for an object whose type is not supported"""
class UnsupportedOperation(ODBError):
    """Raised when the object database cannot support the requested operation"""

View File

@@ -0,0 +1,704 @@
# Copyright (C) 2010, 2011 Sebastian Thiel (byronimo@gmail.com) and contributors
#
# This module is part of GitDB and is released under
# the New BSD License: http://www.opensource.org/licenses/bsd-license.php
"""Contains basic c-functions which usually contain performance critical code
Keeping this code separate from the beginning makes it easier to out-source
it into c later, if required"""
import zlib
from gitdb.util import byte_ord
decompressobj = zlib.decompressobj
import mmap
from itertools import islice
from functools import reduce
from gitdb.const import NULL_BYTE, BYTE_SPACE
from gitdb.utils.encoding import force_text
from gitdb.typ import (
str_blob_type,
str_commit_type,
str_tree_type,
str_tag_type,
)
from io import StringIO
# INVARIANTS
# numeric type ids of the two delta kinds
OFS_DELTA = 6
REF_DELTA = 7
delta_types = (OFS_DELTA, REF_DELTA)

# maps a numeric type id to its type name
# NOTE(review): the EXT placeholders are bytes while the two delta names are
# str; the str_*_type values come from gitdb.typ - confirm their type there
type_id_to_type_map = {
    0: b'',  # EXT 1
    1: str_commit_type,
    2: str_tree_type,
    3: str_blob_type,
    4: str_tag_type,
    5: b'',  # EXT 2
    OFS_DELTA: "OFS_DELTA",  # OFFSET DELTA
    REF_DELTA: "REF_DELTA"  # REFERENCE DELTA
}

# inverse mapping, from type name to numeric type id; the two EXT ids have no
# entry here
type_to_type_id_map = {
    str_commit_type: 1,
    str_tree_type: 2,
    str_blob_type: 3,
    str_tag_type: 4,
    "OFS_DELTA": OFS_DELTA,
    "REF_DELTA": REF_DELTA,
}

# used when dealing with larger streams: 1000 memory pages
chunk_size = 1000 * mmap.PAGESIZE
__all__ = ('is_loose_object', 'loose_object_header_info', 'msb_size', 'pack_object_header_info',
'write_object', 'loose_object_header', 'stream_copy', 'apply_delta_data',
'is_equal_canonical_sha', 'connect_deltas', 'DeltaChunkList', 'create_pack_object_header')
#{ Structures
def _set_delta_rbound(d, size):
"""Truncate the given delta to the given size
:param size: size relative to our target offset, may not be 0, must be smaller or equal
to our size
:return: d"""
d.ts = size
# NOTE: data is truncated automatically when applying the delta
# MUST NOT DO THIS HERE
return d
def _move_delta_lbound(d, bytes):
"""Move the delta by the given amount of bytes, reducing its size so that its
right bound stays static
:param bytes: amount of bytes to move, must be smaller than delta size
:return: d"""
if bytes == 0:
return
d.to += bytes
d.so += bytes
d.ts -= bytes
if d.data is not None:
d.data = d.data[bytes:]
# END handle data
return d
def delta_duplicate(src):
    """:return: new DeltaChunk carrying the same four field values as ``src``"""
    to, ts, so, data = src.to, src.ts, src.so, src.data
    return DeltaChunk(to, ts, so, data)
def delta_chunk_apply(dc, bbuf, write):
    """Write the bytes described by delta chunk ``dc`` using ``write``

    :param dc: chunk to apply
    :param bbuf: buffer providing source bytes for copy operations
    :param write: write method to call with data to write"""
    payload = dc.data
    if payload is None:
        # copy chunk: take the bytes from the source buffer
        begin = dc.so
        write(bbuf[begin:begin + dc.ts])
        return

    # add chunk: only slice if the chunk was truncated, otherwise hand the
    # data through untouched
    if dc.ts < len(payload):
        write(payload[:dc.ts])
        return
    write(payload)
class DeltaChunk:
    """One piece of a delta: it either adds new data to the target, or copies
    existing data from a source buffer"""
    # to:   start offset in the target buffer in bytes
    # ts:   size of this chunk in the target buffer in bytes
    # so:   start offset in the source buffer in bytes or None
    # data: chunk of bytes to be added to the target buffer,
    #       DeltaChunkList to use as base, or None
    __slots__ = ('to', 'ts', 'so', 'data')

    def __init__(self, to, ts, so, data):
        self.to, self.ts, self.so, self.data = to, ts, so, data

    def __repr__(self):
        # missing or empty data is shown as an empty string
        shown = self.data or ""
        return "DeltaChunk(%i, %i, %s, %s)" % (self.to, self.ts, self.so, shown)

    #{ Interface

    def rbound(self):
        """:return: offset in the target buffer right behind this chunk"""
        return self.to + self.ts

    def has_data(self):
        """:return: True if the instance has data to add to the target stream"""
        return not (self.data is None)

    #} END interface
def _closest_index(dcl, absofs):
""":return: index at which the given absofs should be inserted. The index points
to the DeltaChunk with a target buffer absofs that equals or is greater than
absofs.
**Note:** global method for performance only, it belongs to DeltaChunkList"""
lo = 0
hi = len(dcl)
while lo < hi:
mid = (lo + hi) / 2
dc = dcl[mid]
if dc.to > absofs:
hi = mid
elif dc.rbound() > absofs or dc.to == absofs:
return mid
else:
lo = mid + 1
# END handle bound
# END for each delta absofs
return len(dcl) - 1
def delta_list_apply(dcl, bbuf, write):
    """Apply every chunk of the list in order, emitting the result through
    the given write function.

    :param dcl: iterable of delta chunks
    :param bbuf: base buffer containing the base of all deltas contained in this
        list. It will only be used if the chunk in question does not have a base
        chain.
    :param write: function taking a string of bytes to write to the output"""
    apply_chunk = delta_chunk_apply
    for chunk in dcl:
        apply_chunk(chunk, bbuf, write)
def delta_list_slice(dcl, absofs, size, ndcl):
    """Append copies of the chunks covering ``size`` bytes of ``dcl``, starting
    at the absolute target offset ``absofs``, to ``ndcl``. The first and last
    copy are trimmed to the requested range if needed.

    :param dcl: chunk list to take the subsection from
    :param absofs: absolute offset at which the subsection starts
    :param size: size of the subsection in bytes
    :param ndcl: list receiving the new chunks
    :return: None"""
    idx = _closest_index(dcl, absofs)  # index of the first chunk involved
    first = dcl[idx]
    total = len(dcl)
    emit = ndcl.append

    if first.to != absofs:
        # the range starts within the first chunk: cut off its left part
        head = DeltaChunk(first.to, first.ts, first.so, first.data)
        _move_delta_lbound(head, absofs - first.to)
        head.ts = min(head.ts, size)
        emit(head)
        size -= head.ts
        idx += 1
    # END lbound overlap handling

    while idx < total and size:
        chunk = dcl[idx]
        if chunk.ts > size:
            # the range ends within this chunk: shorten the copy and stop
            emit(DeltaChunk(chunk.to, size, chunk.so, chunk.data))
            break
        # the chunk fits completely
        emit(DeltaChunk(chunk.to, chunk.ts, chunk.so, chunk.data))
        size -= chunk.ts
        idx += 1
    # END for each chunk
class DeltaChunkList(list):
    """List with special functionality to deal with DeltaChunks.
    There are two types of lists we represent. The one was created bottom-up, working
    towards the latest delta, the other kind was created top-down, working from the
    latest delta down to the earliest ancestor. This attribute is queryable
    after all processing with is_reversed."""

    __slots__ = tuple()

    def rbound(self):
        """:return: rightmost extend in bytes, absolute. 0 for an empty list"""
        if len(self) == 0:
            return 0
        return self[-1].rbound()

    def lbound(self):
        """:return: leftmost byte at which this chunklist starts. 0 for an empty list"""
        if len(self) == 0:
            return 0
        return self[0].to

    def size(self):
        """:return: size of bytes as measured by our delta chunks"""
        return self.rbound() - self.lbound()

    def apply(self, bbuf, write):
        """Only used by public clients, internally we only use the global routines
        for performance

        :param bbuf: base buffer providing the bytes for copy operations
        :param write: function taking the bytes to write"""
        return delta_list_apply(self, bbuf, write)

    def compress(self):
        """Alter the list to reduce the amount of nodes. Currently we concatenate
        add-chunks: a run of at least three consecutive add-chunks which is
        followed by a copy-chunk is replaced by a single add-chunk.

        :return: self"""
        slen = len(self)
        if slen < 2:
            return self
        i = 0

        first_data_index = None
        while i < slen:
            dc = self[i]
            i += 1
            if dc.data is None:
                # a copy chunk ends the current run of add-chunks, which spans
                # the indices first_data_index to i - 2
                if first_data_index is not None and i - 2 - first_data_index > 1:
                    so = self[first_data_index].to  # start offset in target buffer
                    # chunk data is binary, hence it must be joined as bytes.
                    # Each chunk contributes its data truncated to its size.
                    buf = b"".join(xdc.data[:xdc.ts] for xdc in self[first_data_index:i - 1])

                    del(self[first_data_index:i - 1])
                    self.insert(first_data_index, DeltaChunk(so, len(buf), 0, buf))

                    slen = len(self)
                    i = first_data_index + 1
                # END concatenate data
                first_data_index = None
                continue
            # END skip non-data chunks

            if first_data_index is None:
                first_data_index = i - 1
        # END iterate list

        return self

    def check_integrity(self, target_size=-1):
        """Verify the list has non-overlapping chunks only, and the total size matches
        target_size

        :param target_size: if not -1, the total size of the chain must be target_size
        :raise AssertionError: if the size doesn't match, a chunk is empty or
            carries less data than its size, or neighbouring chunks do not
            connect seamlessly"""
        if target_size > -1:
            assert self[-1].rbound() == target_size
            assert sum(d.ts for d in self) == target_size
        # END target size verification

        if len(self) < 2:
            return

        # check data
        for dc in self:
            assert dc.ts > 0
            if dc.has_data():
                assert len(dc.data) >= dc.ts
        # END for each dc

        # every chunk must end exactly where its right neighbour starts
        for lft, rgt in zip(self, islice(self, 1, None)):
            assert lft.rbound() == rgt.to
            assert lft.to + lft.ts == rgt.to
        # END for each pair
class TopdownDeltaChunkList(DeltaChunkList):
    """A chunk list which is built by feeding it the chunk lists of its
    ancestor streams one by one"""
    __slots__ = tuple()

    def connect_with_next_base(self, bdcl):
        """Connect this chain with the next level of our base delta chunklist.
        Every copy-chunk of ours is replaced by the chunks of the base list
        which cover the copied range. Add-chunks are final: no further base
        can change them anymore.

        :param bdcl: data chunk list being one of our bases. They must be fed in
            consecutively and in order, towards the earliest ancestor delta
        :return: False if all chunks seen were add-chunks, so feeding further
            bases would change nothing, True otherwise"""
        frozen = 0              # number of add-chunks encountered
        pos = 0                 # index of the next chunk to look at
        total = len(self)
        replacement = list()    # reused buffer for the chunks taken from the base

        while pos < total:
            chunk = self[pos]
            pos += 1

            # add-chunks are topmost already and need no further processing
            if chunk.data is not None:
                frozen += 1
                continue

            # copy-chunk: fetch the matching portion of the base list ...
            del replacement[:]
            delta_list_slice(bdcl, chunk.so, chunk.ts, replacement)

            # ... and move its target bounds to where our chunk is located
            shift = chunk.to - chunk.so
            for piece in replacement:
                piece.to += shift

            # substitute our chunk with the pieces, then continue right
            # behind the inserted pieces
            self[pos - 1:pos] = replacement
            total = len(self)
            pos += len(replacement) - 1
        # END for each chunk

        return frozen != total
#} END structures
#{ Routines
def is_loose_object(m):
    """
    :return: True if the data contained in memory map m appears to be a loose
        object, which is the case if it starts with the byte 0x78 and its first
        two bytes, read as big-endian number, are a multiple of 31.
        Only the first two bytes are needed
    :param m: sliceable data, like bytes or a memory map
    :raise ValueError: if m provides less than two bytes"""
    # Iterating bytes yields integers already, whereas a text string yields
    # characters which still need to be converted.
    b0, b1 = (c if isinstance(c, int) else ord(c) for c in m[:2])
    word = (b0 << 8) + b1
    return b0 == 0x78 and (word % 31) == 0
def loose_object_header_info(m):
    """
    Read type and size from the header of a compressed loose object

    :return: tuple(type_string, uncompressed_size_in_bytes) the type string of the
        object as well as its uncompressed size in bytes.
    :param m: memory map from which to read the compressed object data"""
    max_header_bytes = 8192  # is used in cgit as well
    head = decompressobj().decompress(m, max_header_bytes)
    # the header reads "<type> <size>" and ends at the first null byte
    terminator = head.find(NULL_BYTE)
    type_name, size = head[:terminator].split(BYTE_SPACE)
    return type_name, int(size)
def pack_object_header_info(data):
    """
    Decode the header of a pack entry

    :return: tuple(type_id, uncompressed_size_in_bytes, byte_offset)
        The type_id should be interpreted according to the ``type_id_to_type_map`` map
        The byte-offset is the number of header bytes consumed, hence the
        position right behind the header
    :param data: random-access memory, like a string or memory map"""
    byte = byte_ord(data[0])
    pos = 1                     # next byte to read
    type_id = (byte >> 4) & 7   # bits 4 to 6 of the first byte
    size = byte & 15            # lowest four bits of the size
    shift = 4                   # bit position of the next size portion

    # a set top-most bit announces another byte with seven more size bits
    while byte & 0x80:
        byte = byte_ord(data[pos])
        pos += 1
        size += (byte & 0x7f) << shift
        shift += 7
    # END byte loop

    return (type_id, size, pos)
def create_pack_object_header(obj_type, obj_size):
    """
    Encode type and size of an object into a pack entry header

    :return: bytearray defining the pack header comprised of the object type
        and its uncompressed size in bytes
    :param obj_type: pack type_id of the object
    :param obj_size: uncompressed size in bytes of the following object stream"""
    header = bytearray()
    # first byte: type id above the lowest four size bits
    current = (obj_type << 4) | (obj_size & 0xf)
    remaining = obj_size >> 4
    while remaining:
        # more size bits follow: emit the pending byte with its top bit set
        header.append(current | 0x80)
        current = remaining & 0x7f
        remaining >>= 7
    # END until size is consumed
    header.append(current)
    return header
def msb_size(data, offset=0):
    """
    Read a size which is encoded with seven bits per byte, least significant
    portion first. A set top-most bit indicates that another byte follows.

    :param data: random access data yielding integers when indexed, like bytes
    :param offset: index of the first byte of the encoded size
    :return: tuple(end_offset, size), with end_offset being the absolute index
        of the first byte behind the encoded size
    :raise AssertionError: if data ends before the last byte of the size was seen"""
    size = 0
    i = 0
    l = len(data)
    hit_msb = False
    # the offset has to be part of the bounds check, as it is part of the index
    while i + offset < l:
        c = data[i + offset]
        size |= (c & 0x7f) << (i * 7)
        i += 1
        if not c & 0x80:
            hit_msb = True
            break
        # END check msb bit
    # END while in range
    if not hit_msb:
        raise AssertionError("Could not find terminating MSB byte in data stream")
    return i + offset, size
def loose_object_header(type, size):
    """
    :param type: type of the object, converted with force_text
    :param size: size of the content stream in bytes
    :return: bytes representing the loose object header, which is immediately
        followed by the content stream of size 'size'"""
    # layout: type SP size NULL
    text = '%s %i\0' % (force_text(type), size)
    return text.encode('ascii')
def write_object(type, size, read, write, chunk_size=chunk_size):
    """
    Write the loose object header for type and size, followed by up to size
    bytes obtained from read, using the write method

    :param type: type string of the object
    :param size: amount of bytes to write from the source stream
    :param read: read method of a stream providing the content data
    :param write: write method of the output stream. Its return value is used
        as the number of header bytes written
    :param chunk_size: maximum amount of bytes to copy at once
    :return: number returned by write for the header plus the number of
        content bytes copied"""
    written = 0

    # header first: type SP size NULL
    header = loose_object_header(type, size)
    written += write(header)

    # followed by the content
    written += stream_copy(read, write, size, chunk_size)
    return written
def stream_copy(read, write, size, chunk_size):
    """
    Copy up to size bytes from read to write, at most chunk_size bytes at a time

    :param read: method returning up to the requested number of bytes
    :param write: method taking the bytes to write
    :param size: maximum amount of bytes to copy
    :param chunk_size: maximum amount of bytes to request per read
    :return: number of bytes copied, which is less than size if the source
        ran dry early"""
    copied = 0
    done = False
    while not done:
        want = min(chunk_size, size - copied)
        block = read(want)
        got = len(block)
        # The length of the data is counted instead of the return value of
        # write, as not all write methods return the amount of written bytes,
        # like mmap.write
        copied += got
        write(block)
        # a short read marks the end of the source
        done = got < want or copied == size
    # END copy loop
    return copied
def connect_deltas(dstreams):
    """
    Read the condensed delta chunk information from each delta stream and merge
    it into a single list of delta chunks

    :param dstreams: iterable of delta stream objects, the delta to be applied last
        comes first, then all its ancestors in order. Each stream provides
        ``read()``, returning the delta as bytes, and ``size``
    :return: DeltaChunkList, containing all operations to apply
    :raise ValueError: if a delta contains the invalid opcode 0"""
    # the first list is the topmost one, all others get merged into it
    dcl = tdcl = TopdownDeltaChunkList()
    for dsi, ds in enumerate(dstreams):
        db = ds.read()
        delta_buf_size = ds.size

        # read header
        i, base_size = msb_size(db)
        i, target_size = msb_size(db, i)

        # interpret opcodes
        # NOTE: indexing bytes yields integers already, no conversion needed
        tbw = 0  # amount of target bytes written
        while i < delta_buf_size:
            c = db[i]
            i += 1
            if c & 0x80:
                # copy operation: the low seven bits of the opcode tell which
                # bytes of offset and size follow
                cp_off, cp_size = 0, 0
                for bit, shift in ((0x01, 0), (0x02, 8), (0x04, 16), (0x08, 24)):
                    if c & bit:
                        cp_off |= db[i] << shift
                        i += 1
                # END for each offset byte
                for bit, shift in ((0x10, 0), (0x20, 8), (0x40, 16)):
                    if c & bit:
                        cp_size |= db[i] << shift
                        i += 1
                # END for each size byte

                if not cp_size:
                    cp_size = 0x10000

                rbound = cp_off + cp_size
                if (rbound < cp_size or
                        rbound > base_size):
                    # copy range is not within the base, stop processing
                    break

                dcl.append(DeltaChunk(tbw, cp_size, cp_off, None))
                tbw += cp_size
            elif c:
                # add operation: the opcode is the amount of bytes to add
                # NOTE: in C, the data chunks should probably be concatenated here.
                # In python, we do it as a post-process
                dcl.append(DeltaChunk(tbw, c, 0, db[i:i + c]))
                i += c
                tbw += c
            else:
                raise ValueError("unexpected delta opcode 0")
            # END handle command byte
        # END while processing delta data

        dcl.compress()

        # merge the lists !
        if dsi > 0:
            if not tdcl.connect_with_next_base(dcl):
                break
        # END handle merge

        # prepare next base
        dcl = DeltaChunkList()
    # END for each delta stream

    return tdcl
def apply_delta_data(src_buf, src_buf_size, delta_buf, delta_buf_size, write):
    """
    Replay the opcodes of a delta buffer, writing the resulting data

    :param src_buf: random access data from which the delta was created
    :param src_buf_size: size of the source buffer in bytes
    :param delta_buf: random access delta data yielding integers when indexed.
        It must start at the first opcode
    :param delta_buf_size: size of the delta buffer in bytes
    :param write: write method taking a chunk of bytes
    :raise ValueError: if the invalid opcode 0 is encountered
    :raise AssertionError: if processing did not end exactly at delta_buf_size

    **Note:** transcribed to python from the similar routine in patch-delta.c"""
    pos = 0
    delta = delta_buf
    while pos < delta_buf_size:
        opcode = delta[pos]
        pos += 1
        if opcode & 0x80:
            # copy from source: the low seven bits of the opcode tell which
            # bytes of offset and length follow
            offset = 0
            length = 0
            for bit, shift in ((0x01, 0), (0x02, 8), (0x04, 16), (0x08, 24)):
                if opcode & bit:
                    offset |= delta[pos] << shift
                    pos += 1
            for bit, shift in ((0x10, 0), (0x20, 8), (0x40, 16)):
                if opcode & bit:
                    length |= delta[pos] << shift
                    pos += 1
            if length == 0:
                length = 0x10000

            end = offset + length
            if end < length or end > src_buf_size:
                # copy range is not within the source, stop processing
                break
            write(src_buf[offset:end])
        elif opcode:
            # add: the opcode is the amount of delta bytes to write
            write(delta[pos:pos + opcode])
            pos += opcode
        else:
            raise ValueError("unexpected delta opcode 0")
        # END handle opcode
    # END while processing delta data

    # yes, lets use the exact same error message that git uses :)
    assert pos == delta_buf_size, "delta replay has gone wild"
def is_equal_canonical_sha(canonical_length, match, sha1):
    """
    Compare a partial binary sha with a full one

    :return: True if match equals the beginning of sha1. The comparison takes
        the canonical_length of match into account: its last byte is only
        compared by its upper four bits
    :param canonical_length: length of the canonical representation of match
    :param match: less than 20 byte sha
    :param sha1: 20 byte sha"""
    whole_bytes = canonical_length // 2
    if match[:whole_bytes] != sha1[:whole_bytes]:
        return False

    if canonical_length - whole_bytes:
        # compare the upper four bits of the last byte of match with those
        # of the byte at the same position in sha1
        differing_bits = byte_ord(match[-1]) ^ byte_ord(sha1[len(match) - 1])
        if differing_bits & 0xf0:
            return False
    # END handle last byte
    return True
#} END routines
try:
from gitdb_speedups._perf import connect_deltas
except ImportError:
pass

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,730 @@
# Copyright (C) 2010, 2011 Sebastian Thiel (byronimo@gmail.com) and contributors
#
# This module is part of GitDB and is released under
# the New BSD License: http://www.opensource.org/licenses/bsd-license.php
from io import BytesIO
import mmap
import os
import sys
import zlib
from gitdb.fun import (
msb_size,
stream_copy,
apply_delta_data,
connect_deltas,
delta_types
)
from gitdb.util import (
allocate_memory,
LazyMixin,
make_sha,
write,
close,
)
from gitdb.const import NULL_BYTE, BYTE_SPACE
from gitdb.utils.encoding import force_bytes
has_perf_mod = False
try:
from gitdb_speedups._perf import apply_delta as c_apply_delta
has_perf_mod = True
except ImportError:
pass
__all__ = ('DecompressMemMapReader', 'FDCompressedSha1Writer', 'DeltaApplyReader',
'Sha1Writer', 'FlexibleSha1Writer', 'ZippedStoreShaWriter', 'FDCompressedSha1Writer',
'FDStream', 'NullStream')
#{ RO Streams
class DecompressMemMapReader(LazyMixin):

    """Reads data in chunks from a memory map and decompresses it. The client sees
    only the uncompressed data, respective file-like read calls are handling on-demand
    buffered decompression accordingly

    A constraint on the total size of bytes is activated, simulating
    a logical file within a possibly larger physical memory area

    To read efficiently, you clearly don't want to read individual bytes, instead,
    read a few kilobytes at least.

    **Note:** The chunk-size should be carefully selected as it will involve quite a bit
        of string copying due to the way the zlib is implemented. Its very wasteful,
        hence we try to find a good tradeoff between allocation time and number of
        times we actually allocate. An own zlib implementation would be good here
        to better support streamed reading - it would only need to keep the mmap
        and decompress it into chunks, that's all ... """
    __slots__ = ('_m', '_zip', '_buf', '_buflen', '_br', '_cws', '_cwe', '_s', '_close',
                 '_cbr', '_phi')

    max_read_size = 512 * 1024        # currently unused

    def __init__(self, m, close_on_deletion, size=None):
        """Initialize with mmap for stream reading

        :param m: must be content data - use new if you have object data and no size
        :param close_on_deletion: if True, m will be closed by close(), which is
            also called when this instance is deleted
        :param size: size of the uncompressed data to read in total. If None,
            the size attribute is left unset, and the header is parsed from the
            stream once the size is needed (see _set_cache_)"""
        self._m = m
        self._zip = zlib.decompressobj()
        self._buf = None                        # buffer of decompressed bytes
        self._buflen = 0                        # length of bytes in buffer
        if size is not None:
            self._s = size                      # size of uncompressed data to read in total
        self._br = 0                            # num uncompressed bytes read
        self._cws = 0                           # start byte of compression window
        self._cwe = 0                           # end byte of compression window
        self._cbr = 0                           # number of compressed bytes read
        self._phi = False                       # is True if we parsed the header info
        self._close = close_on_deletion         # close the memmap on deletion ?

    def _set_cache_(self, attr):
        """Provide the size attribute by parsing the header from the stream.
        Presumably invoked by LazyMixin when an unset attribute is accessed -
        TODO confirm against LazyMixin"""
        assert attr == '_s'
        # only happens for size, which is a marker to indicate we still
        # have to parse the header from the stream
        self._parse_header_info()

    def __del__(self):
        """Release the underlying data on deletion, if this was requested"""
        self.close()

    def _parse_header_info(self):
        """If this stream contains object data, parse the header info and skip the
        stream to a point where each read will yield object content

        :return: parsed type_string, size"""
        # read header
        # should really be enough, cgit uses 8192 I believe
        # And for good reason !! This needs to be that high for the header to be read correctly in all cases
        maxb = 8192
        # temporarily pretend the stream is maxb bytes large, so that read()
        # hands out up to maxb bytes
        self._s = maxb
        hdr = self.read(maxb)
        hdrend = hdr.find(NULL_BYTE)
        typ, size = hdr[:hdrend].split(BYTE_SPACE)
        size = int(size)
        self._s = size

        # adjust internal state to match actual header length that we ignore
        # The buffer will be depleted first on future reads
        self._br = 0
        hdrend += 1
        # keep the content bytes which were decompressed along with the header
        self._buf = BytesIO(hdr[hdrend:])
        self._buflen = len(hdr) - hdrend

        self._phi = True

        return typ, size

    #{ Interface

    @classmethod
    def new(self, m, close_on_deletion=False):
        """Create a new DecompressMemMapReader instance for acting as a read-only stream
        This method parses the object header from m and returns the parsed
        type and size, as well as the created stream instance.

        :param m: memory map on which to operate. It must be object data ( header + contents )
        :param close_on_deletion: if True, the memory map will be closed once we are
            being deleted
        :return: tuple(type, size, instance)"""
        # NOTE(review): this is a classmethod, hence 'self' is the class here.
        # It is not used, the instance is always a DecompressMemMapReader
        inst = DecompressMemMapReader(m, close_on_deletion, 0)
        typ, size = inst._parse_header_info()
        return typ, size, inst

    def data(self):
        """:return: random access compatible data we are working on"""
        return self._m

    def close(self):
        """Close our underlying stream of compressed bytes if this was allowed during initialization
        :return: True if we closed the underlying stream
        :note: can be called safely
        """
        # NOTE(review): no value is returned by the code below, hence the
        # result is always None, other than stated above
        if self._close:
            if hasattr(self._m, 'close'):
                self._m.close()
            # make sure we close only once
            self._close = False
        # END handle resource freeing

    def compressed_bytes_read(self):
        """
        :return: number of compressed bytes read. This includes the bytes it
            took to decompress the header ( if there was one )"""
        # ABSTRACT: When decompressing a byte stream, it can be that the first
        # x bytes which were requested match the first x bytes in the loosely
        # compressed datastream. This is the worst-case assumption that the reader
        # does, it assumes that it will get at least X bytes from X compressed bytes
        # in all cases.
        # The caveat is that the object, according to our known uncompressed size,
        # is already complete, but there are still some bytes left in the compressed
        # stream that contribute to the amount of compressed bytes.
        # How can we know that we are truly done, and have read all bytes we need
        # to read ?
        # Without help, we cannot know, as we need to obtain the status of the
        # decompression. If it is not finished, we need to decompress more data
        # until it is finished, to yield the actual number of compressed bytes
        # belonging to the decompressed object
        # We are using a custom zlib module for this, if its not present,
        # we try to put in additional bytes up for decompression if feasible
        # and check for the unused_data.

        # Only scrub the stream forward if we are officially done with the
        # bytes we were to have.
        if self._br == self._s and not self._zip.unused_data:
            # manipulate the bytes-read to allow our own read method to continue
            # but keep the window at its current position
            self._br = 0
            if hasattr(self._zip, 'status'):
                while self._zip.status == zlib.Z_OK:
                    self.read(mmap.PAGESIZE)
                # END scrub-loop custom zlib
            else:
                # pass in additional pages, until we have unused data
                while not self._zip.unused_data and self._cbr != len(self._m):
                    self.read(mmap.PAGESIZE)
                # END scrub-loop default zlib
            # END handle stream scrubbing

            # reset bytes read, just to be sure
            self._br = self._s
        # END handle stream scrubbing

        # unused data ends up in the unconsumed tail, which was removed
        # from the count already
        return self._cbr

    #} END interface

    def seek(self, offset, whence=getattr(os, 'SEEK_SET', 0)):
        """Allows to reset the stream to restart reading

        :param offset: must be 0
        :param whence: must be os.SEEK_SET
        :raise ValueError: If offset and whence are not 0"""
        if offset != 0 or whence != getattr(os, 'SEEK_SET', 0):
            raise ValueError("Can only seek to position 0")
        # END handle offset

        # start over with a fresh decompressor and reset all counters
        self._zip = zlib.decompressobj()
        self._br = self._cws = self._cwe = self._cbr = 0
        if self._phi:
            self._phi = False
            del(self._s)        # trigger header parsing on first access
        # END skip header

    def read(self, size=-1):
        """Read uncompressed bytes from the stream

        :param size: maximum amount of bytes to read. If smaller than 1, all
            remaining bytes will be read
        :return: the bytes read, which is an empty bytes object if the size
            of the stream was reached already"""
        if size < 1:
            size = self._s - self._br
        else:
            size = min(size, self._s - self._br)
        # END clamp size

        if size == 0:
            return b''
        # END handle depletion

        # deplete the buffer, then just continue using the decompress object
        # which has an own buffer. We just need this to transparently parse the
        # header from the zlib stream
        dat = b''
        if self._buf:
            if self._buflen >= size:
                # have enough data
                dat = self._buf.read(size)
                self._buflen -= size
                self._br += size
                return dat
            else:
                dat = self._buf.read()        # ouch, duplicates data
                size -= self._buflen
                self._br += self._buflen

                self._buflen = 0
                self._buf = None
            # END handle buffer len
        # END handle buffer

        # decompress some data
        # Abstract: zlib needs to operate on chunks of our memory map ( which may
        # be large ), as it will otherwise and always fill in the 'unconsumed_tail'
        # attribute which possibly reads our whole map to the end, forcing
        # everything to be read from disk even though just a portion was requested.
        # As this would be a nogo, we workaround it by passing only chunks of data,
        # moving the window into the memory map along as we decompress, which keeps
        # the tail smaller than our chunk-size. This causes 'only' the chunk to be
        # copied once, and another copy of a part of it when it creates the unconsumed
        # tail. We have to use it to hand in the appropriate amount of bytes during
        # the next read.
        tail = self._zip.unconsumed_tail
        if tail:
            # move the window, make it as large as size demands. For code-clarity,
            # we just take the chunk from our map again instead of reusing the unconsumed
            # tail. The latter one would save some memory copying, but we could end up
            # with not getting enough data uncompressed, so we had to sort that out as well.
            # Now we just assume the worst case, hence the data is uncompressed and the window
            # needs to be as large as the uncompressed bytes we want to read.
            self._cws = self._cwe - len(tail)
            self._cwe = self._cws + size
        else:
            # nothing left over: the new window starts where the previous one ended
            cws = self._cws
            self._cws = self._cwe
            self._cwe = cws + size
        # END handle tail

        # if window is too small, make it larger so zip can decompress something
        if self._cwe - self._cws < 8:
            self._cwe = self._cws + 8
        # END adjust winsize

        # takes a slice, but doesn't copy the data, it says ...
        indata = self._m[self._cws:self._cwe]

        # get the actual window end to be sure we don't use it for computations
        self._cwe = self._cws + len(indata)
        dcompdat = self._zip.decompress(indata, size)
        # update the amount of compressed bytes read
        # We feed possibly overlapping chunks, which is why the unconsumed tail
        # has to be taken into consideration, as well as the unused data
        # if we hit the end of the stream
        # NOTE: Behavior changed in PY2.7 onward, which requires special handling to make the tests work properly.
        # They are thorough, and I assume it is truly working.
        # Why is this logic as convoluted as it is ? Please look at the table in
        # https://github.com/gitpython-developers/gitdb/issues/19 to learn about the test-results.
        # Basically, on py2.6, you want to use branch 1, whereas on all other python version, the second branch
        # will be the one that works.
        # However, the zlib VERSIONs as well as the platform check is used to further match the entries in the
        # table in the github issue. This is it ... it was the only way I could make this work everywhere.
        # IT's CERTAINLY GOING TO BITE US IN THE FUTURE ... .
        if zlib.ZLIB_VERSION in ('1.2.7', '1.2.5') and not sys.platform == 'darwin':
            unused_datalen = len(self._zip.unconsumed_tail)
        else:
            unused_datalen = len(self._zip.unconsumed_tail) + len(self._zip.unused_data)
        # # end handle very special case ...

        self._cbr += len(indata) - unused_datalen
        self._br += len(dcompdat)

        if dat:
            dcompdat = dat + dcompdat
        # END prepend our cached data

        # it can happen, depending on the compression, that we get less bytes
        # than ordered as it needs the final portion of the data as well.
        # Recursively resolve that.
        # Note: dcompdat can be empty even though we still appear to have bytes
        # to read, if we are called by compressed_bytes_read - it manipulates
        # us to empty the stream
        if dcompdat and (len(dcompdat) - len(dat)) < size and self._br < self._s:
            dcompdat += self.read(size - len(dcompdat))
        # END handle special case
        return dcompdat
class DeltaApplyReader(LazyMixin):
    """A reader which dynamically applies pack deltas to a base object, keeping the
    memory demands to a minimum.

    The size of the final object is only obtainable once all deltas have been
    applied, unless it is retrieved from a pack index.

    The uncompressed Delta has the following layout (MSB being a most significant
    bit encoded dynamic size):

    * MSB Source Size - the size of the base against which the delta was created
    * MSB Target Size - the size of the resulting data after the delta was applied
    * A list of one byte commands (cmd) which are followed by a specific protocol:

     * cmd & 0x80 - copy delta_data[offset:offset+size]

      * Followed by an encoded offset into the delta data
      * Followed by an encoded size of the chunk to copy

     * cmd & 0x7f - insert

      * insert cmd bytes from the delta buffer into the output stream

     * cmd == 0 - invalid operation ( or error in delta stream )

    NOTE(review): in git's pack format the copy command (cmd & 0x80) copies
    from the *base/source* buffer, not from the delta data - the wording above
    looks imprecise; confirm against apply_delta_data before relying on it.
    """
    __slots__ = (
        "_bstream",             # base stream to which to apply the deltas
        "_dstreams",            # tuple of delta stream readers
        "_mm_target",           # memory map of the delta-applied data
        "_size",                # actual number of bytes in _mm_target
        "_br"                   # number of bytes read
    )

    #{ Configuration
    # NOTE(review): within this class the constant is only referenced by the
    # commented-out condition in _set_cache_too_slow_without_c
    k_max_memory_move = 250 * 1000 * 1000
    #} END configuration

    def __init__(self, stream_list):
        """Initialize this instance with a list of streams, the first stream being
        the delta to apply on top of all following deltas, the last stream being the
        base object onto which to apply the deltas

        :param stream_list: sequence of at least two streams; all but the last
            are stored as delta streams, the last one as base stream
        :note: the length requirement is enforced with an ``assert`` only, use
            :meth:`new` to get a ``ValueError`` for invalid input"""
        assert len(stream_list) > 1, "Need at least one delta and one base stream"

        self._bstream = stream_list[-1]
        self._dstreams = tuple(stream_list[:-1])
        self._br = 0

    def _set_cache_too_slow_without_c(self, attr):
        """Fill ``_mm_target`` and ``_size`` by merging all deltas into a single
        delta chunk list first, then applying that list to the base buffer.
        Falls back to :meth:`_set_cache_brute_` if there is only one delta.

        :param attr: name of the requested attribute, not used by this method"""
        # the direct algorithm is fastest and most direct if there is only one
        # delta. Also, the extra overhead might not be worth it for items smaller
        # than X - definitely the case in python, every function call costs
        # huge amounts of time
        # if len(self._dstreams) * self._bstream.size < self.k_max_memory_move:
        if len(self._dstreams) == 1:
            return self._set_cache_brute_(attr)

        # Aggregate all deltas into one delta in reverse order. Hence we take
        # the last delta, and reverse-merge its ancestor delta, until we receive
        # the final delta data stream.
        dcl = connect_deltas(self._dstreams)

        # query rbound() instead of calling len(), as the (optional) c version
        # doesn't implement the sequence protocol
        if dcl.rbound() == 0:
            self._size = 0
            self._mm_target = allocate_memory(0)
            return
        # END handle empty list

        self._size = dcl.rbound()
        self._mm_target = allocate_memory(self._size)

        # read the whole base object into a buffer the chunk list can work on
        bbuf = allocate_memory(self._bstream.size)
        stream_copy(self._bstream.read, bbuf.write, self._bstream.size, 256 * mmap.PAGESIZE)

        # APPLY CHUNKS
        write = self._mm_target.write
        dcl.apply(bbuf, write)

        # rewind, so that read() starts at the beginning of the target data
        self._mm_target.seek(0)

    def _set_cache_brute_(self, attr):
        """If we are here, we apply the actual deltas

        Applies one delta after another, starting with the last delta stream
        and working towards the first one, swapping source and target buffer
        after each step. Sets ``_mm_target`` and ``_size`` when done.

        :param attr: name of the requested attribute, not used by this method"""
        # TODO: There should be a special case if there is only one stream
        # Then the default-git algorithm should perform a tad faster, as the
        # delta is not peaked into, causing less overhead.
        buffer_info_list = list()
        max_target_size = 0
        for dstream in self._dstreams:
            buf = dstream.read(512)         # read the header information + X
            offset, src_size = msb_size(buf)
            offset, target_size = msb_size(buf, offset)
            # keep the already-read command bytes, along with the header length
            buffer_info_list.append((buf[offset:], offset, src_size, target_size))
            max_target_size = max(max_target_size, target_size)
        # END for each delta stream

        # NOTE: the first delta to apply should have the same source size as
        # our actual base stream - this is not verified here though
        base_size = self._bstream.size
        target_size = max_target_size

        # if we have more than 1 delta to apply, we will swap buffers, hence we must
        # assure that all buffers we use are large enough to hold all the results
        if len(self._dstreams) > 1:
            base_size = target_size = max(base_size, max_target_size)
        # END adjust buffer sizes

        # Allocate private memory map big enough to hold the first base buffer
        # We need random access to it
        bbuf = allocate_memory(base_size)
        stream_copy(self._bstream.read, bbuf.write, base_size, 256 * mmap.PAGESIZE)

        # allocate memory map large enough for the largest (intermediate) target
        # We will use it as scratch space for all delta ops. If the final
        # target buffer is smaller than our allocated space, we just use parts
        # of it upon return.
        tbuf = allocate_memory(target_size)

        # for each delta to apply, memory map the decompressed delta and
        # work on the op-codes to reconstruct everything.
        # For the actual copying, we use a seek and write pattern of buffer
        # slices.
        final_target_size = None
        for (dbuf, offset, src_size, target_size), dstream in zip(reversed(buffer_info_list), reversed(self._dstreams)):
            # allocate a buffer to hold all delta data - fill in the data for
            # fast access. We do this as we know that reading individual bytes
            # from our stream would be slower than necessary ( although possible )
            # The dbuf buffer contains commands after the first two MSB sizes, the
            # offset specifies the amount of bytes read to get the sizes.
            ddata = allocate_memory(dstream.size - offset)
            ddata.write(dbuf)
            # read the rest from the stream. The size we give is larger than necessary
            stream_copy(dstream.read, ddata.write, dstream.size, 256 * mmap.PAGESIZE)

            #######################################################################
            # prefer the c implementation if it was made available in this module
            if 'c_apply_delta' in globals():
                c_apply_delta(bbuf, ddata, tbuf)
            else:
                apply_delta_data(bbuf, src_size, ddata, len(ddata), tbuf.write)
            #######################################################################

            # finally, swap out source and target buffers. The target is now the
            # base for the next delta to apply
            bbuf, tbuf = tbuf, bbuf
            bbuf.seek(0)
            tbuf.seek(0)
            final_target_size = target_size
        # END for each delta to apply

        # its already seeked to 0, constrain it to the actual size
        # NOTE: in the end of the loop, it swaps buffers, hence our target buffer
        # is not tbuf, but bbuf !
        self._mm_target = bbuf
        self._size = final_target_size

    #{ Configuration
    # choose the cache implementation once, when the class body is executed
    # NOTE(review): presumably LazyMixin calls _set_cache_ on first access of an
    # unset slot - confirm against LazyMixin
    if not has_perf_mod:
        _set_cache_ = _set_cache_brute_
    else:
        _set_cache_ = _set_cache_too_slow_without_c

    #} END configuration

    def read(self, count=0):
        """Read bytes of the delta-applied data

        :param count: maximum amount of bytes to read. Values smaller than 1, or
            larger than the amount of bytes left, read everything that is left
        :return: the data read from the target memory map"""
        bl = self._size - self._br      # bytes left
        if count < 1 or count > bl:
            count = bl
        # NOTE: we could check for certain size limits, and possibly
        # return buffers instead of strings to prevent byte copying
        data = self._mm_target.read(count)
        self._br += len(data)
        return data

    def seek(self, offset, whence=getattr(os, 'SEEK_SET', 0)):
        """Allows to reset the stream to restart reading

        :param offset: must be 0
        :param whence: must be os.SEEK_SET, or 0 if os does not provide it
        :raise ValueError: If offset and whence are not 0"""
        if offset != 0 or whence != getattr(os, 'SEEK_SET', 0):
            raise ValueError("Can only seek to position 0")
        # END handle offset
        self._br = 0
        self._mm_target.seek(0)

    #{ Interface

    @classmethod
    def new(cls, stream_list):
        """
        Convert the given list of streams into a stream which resolves deltas
        when reading from it.

        :param stream_list: two or more stream objects, first stream is a Delta
            to the object that you want to resolve, followed by N additional delta
            streams. The list's last stream must be a non-delta stream.

        :return: Non-Delta OPackStream object whose stream can be used to obtain
            the decompressed resolved data
        :raise ValueError: if the stream list cannot be handled"""
        if len(stream_list) < 2:
            raise ValueError("Need at least two streams")
        # END single object special handling

        # the last stream is the base - it must not be a delta itself
        if stream_list[-1].type_id in delta_types:
            raise ValueError(
                "Cannot resolve deltas if there is no base object stream, last one was type: %s" % stream_list[-1].type)
        # END check stream
        return cls(stream_list)

    #} END interface

    #{ OInfo like Interface

    @property
    def type(self):
        """:return: type of the base stream"""
        return self._bstream.type

    @property
    def type_id(self):
        """:return: type id of the base stream"""
        return self._bstream.type_id

    @property
    def size(self):
        """:return: number of uncompressed bytes in the stream"""
        return self._size

    #} END oinfo like interface
#} END RO streams
#{ W Streams
class Sha1Writer:
    """Write-only stream which digests everything written to it, making the
    sha of all data seen so far available at any time"""
    __slots__ = "sha1"

    def __init__(self):
        self.sha1 = make_sha()

    #{ Stream Interface

    def write(self, data):
        """Feed the given bytes into the digest

        :raise IOError: If not all bytes could be written
        :param data: byte object
        :return: length of incoming data"""
        digest = self.sha1
        digest.update(data)
        return len(data)

    # END stream interface

    #{ Interface

    def sha(self, as_hex=False):
        """:return: sha so far
        :param as_hex: if True, sha will be hex-encoded, binary otherwise"""
        digest = self.sha1
        return digest.hexdigest() if as_hex else digest.digest()

    #} END interface
class FlexibleSha1Writer(Sha1Writer):
    """Writer producing a sha1 while passing on the written bytes to the given
    write function"""
    __slots__ = 'writer'

    def __init__(self, writer):
        """:param writer: callable taking the written bytes as its only argument"""
        Sha1Writer.__init__(self)
        self.writer = writer

    def write(self, data):
        """Digest the data, then hand it to the wrapped write function

        :param data: byte object
        :return: length of incoming data, as the base implementation does"""
        # the base class returns the digested length - keep the stream contract
        # intact instead of implicitly returning None
        alen = Sha1Writer.write(self, data)
        self.writer(data)
        return alen
class ZippedStoreShaWriter(Sha1Writer):
    """Digests everything written to it and keeps a zlib-compressed copy of the
    data in an in-memory buffer"""
    __slots__ = ('buf', 'zip')

    def __init__(self):
        Sha1Writer.__init__(self)
        self.buf = BytesIO()
        self.zip = zlib.compressobj(zlib.Z_BEST_SPEED)

    def __getattr__(self, attr):
        # everything we don't provide ourselves is looked up on the buffer
        return getattr(self.buf, attr)

    def write(self, data):
        """Digest the data and append its compressed form to the buffer

        :return: length of incoming data"""
        alen = Sha1Writer.write(self, data)
        compressed = self.zip.compress(data)
        self.buf.write(compressed)
        return alen

    def close(self):
        """Flush the compressor, appending the remaining bytes to the buffer"""
        tail = self.zip.flush()
        self.buf.write(tail)

    def seek(self, offset, whence=getattr(os, 'SEEK_SET', 0)):
        """Seeking currently only supports to rewind written data
        Multiple writes are not supported

        :raise ValueError: if anything but a rewind to position 0 is requested"""
        is_rewind = offset == 0 and whence == getattr(os, 'SEEK_SET', 0)
        if not is_rewind:
            raise ValueError("Can only seek to position 0")
        # END handle offset
        self.buf.seek(0)

    def getvalue(self):
        """:return: the value of the underlying buffer, as provided by its
            getvalue() method"""
        return self.buf.getvalue()
class FDCompressedSha1Writer(Sha1Writer):
    """Digests data written to it, making the sha available, then compress the
    data and write it to the file descriptor

    **Note:** operates on raw file descriptors
    **Note:** for this to work, you have to use the close-method of this instance"""
    __slots__ = ("fd", "sha1", "zip")

    # default exception
    exc = IOError("Failed to write all bytes to filedescriptor")

    def __init__(self, fd):
        """:param fd: raw file descriptor to write the compressed data to"""
        super().__init__()
        self.fd = fd
        self.zip = zlib.compressobj(zlib.Z_BEST_SPEED)

    #{ Stream Interface

    def write(self, data):
        """:raise IOError: If not all bytes could be written
        :return: length of incoming data"""
        self.sha1.update(data)
        cdata = self.zip.compress(data)
        bytes_written = write(self.fd, cdata)

        if bytes_written != len(cdata):
            raise self.exc

        return len(data)

    def close(self):
        """Write the remaining compressed bytes and close the file descriptor

        The descriptor is closed even if writing the remainder fails, so that it
        does not leak.

        :raise IOError: If not all bytes could be written
        :return: whatever closing the file descriptor returns"""
        try:
            remainder = self.zip.flush()
            if write(self.fd, remainder) != len(remainder):
                raise self.exc
        finally:
            # no 'return' in here - it would swallow a pending exception
            result = close(self.fd)
        return result

    #} END stream interface
class FDStream:
    """A simple wrapper providing the most basic functions on a file descriptor
    with the fileobject interface. Cannot use os.fdopen as the resulting stream
    takes ownership"""
    __slots__ = ("_fd", '_pos')

    def __init__(self, fd):
        """:param fd: raw file descriptor to operate on"""
        self._fd = fd
        self._pos = 0

    def write(self, data):
        """Write the given bytes to the file descriptor

        :return: number of bytes actually written"""
        written = os.write(self._fd, data)
        # only advance by what was really written, os.write may write less
        self._pos += written
        return written

    def read(self, count=0):
        """Read bytes from the file descriptor

        :param count: maximum number of bytes to read; 0 means read up to the
            size of the underlying file
        :return: the bytes read"""
        if count == 0:
            # we only know the descriptor, not a path - ask the descriptor for
            # the file size
            count = os.fstat(self._fd).st_size
        # END handle read everything

        data = os.read(self._fd, count)
        self._pos += len(data)
        return data

    def fileno(self):
        """:return: the wrapped file descriptor"""
        return self._fd

    def tell(self):
        """:return: number of bytes read and written through this instance"""
        return self._pos

    def close(self):
        """Close the wrapped file descriptor"""
        close(self._fd)
class NullStream:
    """Stream look-alike which discards everything written to it and never
    yields any data.

    Use it like /dev/null"""
    __slots__ = tuple()

    def read(self, size=0):
        """:return: always an empty string, no matter the requested size"""
        return ''

    def close(self):
        """Does nothing"""
        pass

    def write(self, data):
        """:return: length of the data, which is otherwise ignored"""
        return len(data)
#} END W streams

View File

@@ -0,0 +1,4 @@
# Copyright (C) 2010, 2011 Sebastian Thiel (byronimo@gmail.com) and contributors
#
# This module is part of GitDB and is released under
# the New BSD License: http://www.opensource.org/licenses/bsd-license.php

View File

@@ -0,0 +1,192 @@
# Copyright (C) 2010, 2011 Sebastian Thiel (byronimo@gmail.com) and contributors
#
# This module is part of GitDB and is released under
# the New BSD License: http://www.opensource.org/licenses/bsd-license.php
"""Utilities used in ODB testing"""
from gitdb import OStream
import sys
import random
from array import array
from io import BytesIO
import glob
import unittest
import tempfile
import shutil
import os
import gc
import logging
from functools import wraps
#{ Bases
class TestBase(unittest.TestCase):
    """Common base of all test cases

    Gives access to a read-only git repository through the class attribute

    * gitrepopath

     * read-only base path of the git source repository, i.e. .../git/.git
    """

    #{ Invariants
    k_env_git_repo = "GITDB_TEST_GIT_REPO_BASE"
    #} END invariants

    @classmethod
    def setUpClass(cls):
        """Determine the repository to test against, either from the
        environment or by falling back to the .git directory three levels
        above this file"""
        try:
            super().setUpClass()
        except AttributeError:
            pass

        repo_path = os.environ.get(cls.k_env_git_repo)
        if not repo_path:
            logging.info(
                "You can set the %s environment variable to a .git repository of your choice - defaulting to the gitdb repository", cls.k_env_git_repo)
            parent = os.path.dirname
            repo_path = os.path.join(parent(parent(parent(__file__))), '.git')
        # end assure gitrepo is set
        cls.gitrepopath = repo_path
        assert cls.gitrepopath.endswith('.git')
#} END bases
#{ Decorators
def with_rw_directory(func):
    """Create a temporary directory which can be written to, remove it if the
    test succeeds, but leave it otherwise to aid additional debugging

    :param func: test method taking ``(self, path)``
    :return: wrapper taking ``(self)`` only, which passes the path of the
        freshly created directory on to ``func``"""

    def wrapper(self):
        # mkdtemp creates the directory atomically, unlike mktemp + mkdir which
        # leaves a window in which the name can be taken by someone else
        path = tempfile.mkdtemp(prefix=func.__name__)
        keep = False
        try:
            return func(self, path)
        except Exception:
            sys.stderr.write(f"Test {type(self).__name__}.{func.__name__} failed, output is at {path!r}\n")
            keep = True
            raise
        finally:
            # Need to collect here to be sure all handles have been closed. It appears
            # a windows-only issue. In fact things should be deleted, as well as
            # memory maps closed, once objects go out of scope. For some reason
            # though this is not the case here unless we collect explicitly.
            if not keep:
                gc.collect()
                shutil.rmtree(path)
            # END handle exception
    # END wrapper

    # Only the name is copied on purpose. NOTE(review): functools.wraps would
    # also set __wrapped__, which presumably makes pytest treat the wrapped
    # function's ``path`` parameter as a fixture - confirm before switching.
    wrapper.__name__ = func.__name__
    return wrapper
def with_packs_rw(func):
    """Decorator which fills the directory handed to the test with the pack
    fixtures, hard-linking where possible, before the test itself is run
    with that directory"""

    def wrapper(self, path):
        copy_files_globbed(fixture_path('packs/*'), path, hard_link_ok=True)
        return func(self, path)
    # END wrapper

    wrapper.__name__ = func.__name__
    return wrapper
#} END decorators
#{ Routines
def fixture_path(relapath=''):
    """:return: path into the 'fixtures' directory located next to this file
    :param relapath: relative path into the fixtures directory, or ''
        to obtain the fixture directory itself"""
    here = os.path.dirname(__file__)
    return os.path.join(here, 'fixtures', relapath)
def copy_files_globbed(source_glob, target_dir, hard_link_ok=False):
    """Put every file matching the source glob into the target directory

    :param source_glob: glob pattern selecting the files to transfer
    :param target_dir: directory receiving the files
    :param hard_link_ok: if True, hard links will be created if possible. Otherwise
        the files will be copied"""
    may_link = hard_link_ok and hasattr(os, 'link')
    for src_file in glob.glob(source_glob):
        if not may_link:
            shutil.copy(src_file, target_dir)
            continue
        # END plain copy

        link_path = os.path.join(target_dir, os.path.basename(src_file))
        try:
            os.link(src_file, link_path)
        except OSError:
            # linking failed, for instance across devices - copy instead
            shutil.copy(src_file, target_dir)
        # END try hard link
    # END for each file to copy
def make_bytes(size_in_bytes, randomize=False):
    """:return: bytes of an array of signed ints ('i') holding the numbers
        0 .. size_in_bytes // 4 - 1
    :param size_in_bytes: requested size, it is divided by 4 to obtain the
        number of ints to produce
    :param randomize: try to produce a very random stream by shuffling the
        numbers"""
    numbers = list(range(size_in_bytes // 4))
    if randomize:
        random.shuffle(numbers)
    # END randomize
    return array('i', numbers).tobytes()
def make_object(type, data):
    """:return: bytes resembling an uncompressed object, which is a header of
        the form '<type> <size>' terminated by a NUL byte, followed by the data
    :param type: type name of the object, either as bytes or as ascii string
    :param data: bytes making up the object's content"""
    # the type used to be ignored in favor of a hard-coded 'blob'
    if isinstance(type, str):
        type = type.encode("ascii")
    header = b"%s %i\0" % (type, len(data))
    return header + data
def make_memory_file(size_in_bytes, randomize=False):
    """Wrap the result of make_bytes into an in-memory stream

    :return: tuple(size_of_stream, stream)
    :param randomize: try to produce a very random stream"""
    content = make_bytes(size_in_bytes, randomize)
    stream = BytesIO(content)
    return len(content), stream
#} END routines
#{ Stream Utilities
class DummyStream:
    """Stream stand-in which merely records how it was used"""

    def __init__(self):
        self.was_read = self.closed = False
        self.bytes = 0

    def read(self, size):
        """Remember that a read happened, and the size that was asked for"""
        self.bytes = size
        self.was_read = True

    def close(self):
        """Remember that the stream was closed"""
        self.closed = True

    def _assert(self):
        """Fail unless read() was called at least once"""
        assert self.was_read
class DeriveTest(OStream):
    """OStream subtype taking additional positional arguments as well as the
    mandatory 'myarg' keyword argument, keeping both for later verification"""

    def __init__(self, sha, type, size, stream, *args, **kwargs):
        myarg = kwargs.pop('myarg')
        self.myarg = myarg
        self.args = args

    def _assert(self):
        """Fail unless extra positional arguments and a truthy myarg were given"""
        assert self.args
        assert self.myarg
#} END stream utilities

View File

@@ -0,0 +1,105 @@
# Copyright (C) 2010, 2011 Sebastian Thiel (byronimo@gmail.com) and contributors
#
# This module is part of GitDB and is released under
# the New BSD License: http://www.opensource.org/licenses/bsd-license.php
"""Test for object db"""
from gitdb.test.lib import (
TestBase,
DummyStream,
DeriveTest,
)
from gitdb import (
OInfo,
OPackInfo,
ODeltaPackInfo,
OStream,
OPackStream,
ODeltaPackStream,
IStream
)
from gitdb.util import (
NULL_BIN_SHA
)
from gitdb.typ import (
str_blob_type
)
class TestBaseTypes(TestBase):
    """Verifies the accessors of the info and stream types"""

    def test_streams(self):
        """Check the attributes of all info and stream types, and that reads
        are passed on to the wrapped stream"""
        # test info
        sha = NULL_BIN_SHA
        s = 20
        blob_id = 3

        info = OInfo(sha, str_blob_type, s)
        assert info.binsha == sha
        assert info.type == str_blob_type
        assert info.type_id == blob_id
        assert info.size == s

        # test pack info
        # provides type_id
        pinfo = OPackInfo(0, blob_id, s)
        assert pinfo.type == str_blob_type
        assert pinfo.type_id == blob_id
        assert pinfo.pack_offset == 0

        dpinfo = ODeltaPackInfo(0, blob_id, s, sha)
        assert dpinfo.type == str_blob_type
        assert dpinfo.type_id == blob_id
        assert dpinfo.delta_info == sha
        assert dpinfo.pack_offset == 0

        # test ostream
        stream = DummyStream()
        ostream = OStream(*(info + (stream, )))
        assert ostream.stream is stream
        ostream.read(15)
        stream._assert()
        assert stream.bytes == 15
        ostream.read(20)
        assert stream.bytes == 20

        # test packstream
        postream = OPackStream(*(pinfo + (stream, )))
        assert postream.stream is stream
        postream.read(10)
        stream._assert()
        assert stream.bytes == 10

        # test deltapackstream
        dpostream = ODeltaPackStream(*(dpinfo + (stream, )))
        # this comparison used to lack the assert, verifying nothing
        assert dpostream.stream is stream
        dpostream.read(5)
        stream._assert()
        assert stream.bytes == 5

        # derive with own args
        DeriveTest(sha, str_blob_type, s, stream, 'mine', myarg=3)._assert()

        # test istream
        istream = IStream(str_blob_type, s, stream)
        assert istream.binsha is None
        istream.binsha = sha
        assert istream.binsha == sha

        assert len(istream.binsha) == 20
        assert len(istream.hexsha) == 40

        assert istream.size == s
        istream.size = s * 2
        # this comparison used to lack the assert as well
        assert istream.size == s * 2
        assert istream.type == str_blob_type
        istream.type = "something"
        assert istream.type == "something"
        assert istream.stream is stream
        istream.stream = None
        assert istream.stream is None

        assert istream.error is None
        istream.error = Exception()
        assert isinstance(istream.error, Exception)

View File

@@ -0,0 +1,43 @@
# Copyright (C) 2010, 2011 Sebastian Thiel (byronimo@gmail.com) and contributors
#
# This module is part of GitDB and is released under
# the New BSD License: http://www.opensource.org/licenses/bsd-license.php
"""Module with examples from the tutorial section of the docs"""
import os
from gitdb.test.lib import TestBase
from gitdb import IStream
from gitdb.db import LooseObjectDB
from io import BytesIO
class TestExamples(TestBase):
    """Runs the code shown in the tutorial section of the docs"""

    def test_base(self):
        """Iterate all loose objects of the test repository, then store a new one"""
        loose_db = LooseObjectDB(os.path.join(self.gitrepopath, 'objects'))

        for binsha in loose_db.sha_iter():
            info = loose_db.info(binsha)
            stream = loose_db.stream(binsha)
            assert info[:3] == stream[:3]

            assert len(stream.read()) == stream.size
            assert loose_db.has_object(info.binsha)
        # END for each sha in database

        # drop our references to assure we close all files
        try:
            del stream
            del info
        except UnboundLocalError:
            pass
        # END ignore exception if there are no loose objects

        payload = b"my data"
        new_stream = IStream("blob", len(payload), BytesIO(payload))

        # the object does not yet have a sha
        assert new_stream.binsha is None
        loose_db.store(new_stream)
        # now the sha is set
        assert len(new_stream.binsha) == 20
        assert loose_db.has_object(new_stream.binsha)

View File

@@ -0,0 +1,249 @@
# Copyright (C) 2010, 2011 Sebastian Thiel (byronimo@gmail.com) and contributors
#
# This module is part of GitDB and is released under
# the New BSD License: http://www.opensource.org/licenses/bsd-license.php
"""Test everything about packs reading and writing"""
from gitdb.test.lib import (
TestBase,
with_rw_directory,
fixture_path
)
from gitdb.stream import DeltaApplyReader
from gitdb.pack import (
PackEntity,
PackIndexFile,
PackFile
)
from gitdb.base import (
OInfo,
OStream,
)
from gitdb.fun import delta_types
from gitdb.exc import UnsupportedOperation
from gitdb.util import to_bin_sha
import pytest
import os
import tempfile
#{ Utilities
def bin_sha_from_filename(filename):
    """:return: result of to_bin_sha for the base name of the given file,
        without its extension and without its first five characters"""
    stem, _extension = os.path.splitext(os.path.basename(filename))
    return to_bin_sha(stem[5:])
#} END utilities
class TestPack(TestBase):
    """Tests reading of pack and index fixtures, as well as writing new packs"""

    # every fixture is a tuple of (path, expected version, expected object count)
    packindexfile_v1 = (fixture_path('packs/pack-c0438c19fb16422b6bbcce24387b3264416d485b.idx'), 1, 67)
    packindexfile_v2 = (fixture_path('packs/pack-11fdfa9e156ab73caae3b6da867192221f2089c2.idx'), 2, 30)
    packindexfile_v2_3_ascii = (fixture_path('packs/pack-a2bf8e71d8c18879e499335762dd95119d93d9f1.idx'), 2, 42)
    packfile_v2_1 = (fixture_path('packs/pack-c0438c19fb16422b6bbcce24387b3264416d485b.pack'), 2, packindexfile_v1[2])
    packfile_v2_2 = (fixture_path('packs/pack-11fdfa9e156ab73caae3b6da867192221f2089c2.pack'), 2, packindexfile_v2[2])
    packfile_v2_3_ascii = (
        fixture_path('packs/pack-a2bf8e71d8c18879e499335762dd95119d93d9f1.pack'), 2, packindexfile_v2_3_ascii[2])

    def _assert_index_file(self, index, version, size):
        """Verify checksums, version, size and all entries of the given index

        :param index: the index file instance to verify
        :param version: version the index is expected to have
        :param size: amount of objects the index is expected to contain"""
        assert index.packfile_checksum() != index.indexfile_checksum()
        assert len(index.packfile_checksum()) == 20
        assert len(index.indexfile_checksum()) == 20
        assert index.version() == version
        assert index.size() == size
        assert len(index.offsets()) == size

        # get all data of all objects
        for oidx in range(index.size()):
            sha = index.sha(oidx)
            assert oidx == index.sha_to_index(sha)

            entry = index.entry(oidx)
            assert len(entry) == 3

            assert entry[0] == index.offset(oidx)
            assert entry[1] == sha
            assert entry[2] == index.crc(oidx)

            # verify partial sha
            for length in (4, 8, 11, 17, 20):
                assert index.partial_sha_to_index(sha[:length], length * 2) == oidx
        # END for each object index in indexfile
        self.assertRaises(ValueError, index.partial_sha_to_index, "\0", 2)

    def _assert_pack_file(self, pack, version, size):
        """Verify version, size and all streams of the given pack

        :param pack: the pack file instance to verify
        :param version: version the pack is expected to have
        :param size: amount of objects the pack is expected to contain"""
        # compare against the expectation handed in, it used to be ignored in
        # favor of a hard-coded 2
        assert pack.version() == version
        assert pack.size() == size
        assert len(pack.checksum()) == 20

        num_obj = 0
        for obj in pack.stream_iter():
            num_obj += 1
            info = pack.info(obj.pack_offset)
            stream = pack.stream(obj.pack_offset)

            assert info.pack_offset == stream.pack_offset
            assert info.type_id == stream.type_id
            assert hasattr(stream, 'read')

            # it should be possible to read from both streams
            assert obj.read() == stream.read()

            streams = pack.collect_streams(obj.pack_offset)
            assert streams

            # read the stream
            try:
                dstream = DeltaApplyReader.new(streams)
            except ValueError:
                # ignore these, old git versions use only ref deltas,
                # which we haven't resolved ( as we are without an index )
                # Also ignore non-delta streams
                continue
            # END get deltastream

            # read all
            data = dstream.read()
            assert len(data) == dstream.size

            # test seek
            dstream.seek(0)
            assert dstream.read() == data

            # read chunks
            # NOTE: the current implementation is safe, it basically transfers
            # all calls to the underlying memory map

        # END for each object
        assert num_obj == size

    def test_pack_index(self):
        """Verify the version 1 and version 2 index fixtures"""
        # check version 1 and 2
        for indexfile, version, size in (self.packindexfile_v1, self.packindexfile_v2):
            index = PackIndexFile(indexfile)
            self._assert_index_file(index, version, size)
        # END run tests

    def test_pack(self):
        """Verify all pack fixtures"""
        # there is this special version 3, but apparently its like 2 ...
        for packfile, version, size in (self.packfile_v2_3_ascii, self.packfile_v2_1, self.packfile_v2_2):
            pack = PackFile(packfile)
            self._assert_pack_file(pack, version, size)
        # END for each pack to test

    @with_rw_directory
    def test_pack_entity(self, rw_dir):
        """Verify the pack entities of all fixtures, then write all of their
        objects into new packs within rw_dir and verify those as well"""
        pack_objs = list()
        for packinfo, indexinfo in ((self.packfile_v2_1, self.packindexfile_v1),
                                    (self.packfile_v2_2, self.packindexfile_v2),
                                    (self.packfile_v2_3_ascii, self.packindexfile_v2_3_ascii)):
            packfile, version, size = packinfo
            indexfile, version, size = indexinfo
            entity = PackEntity(packfile)
            assert entity.pack().path() == packfile
            assert entity.index().path() == indexfile
            pack_objs.extend(entity.stream_iter())

            count = 0
            for info, stream in zip(entity.info_iter(), entity.stream_iter()):
                count += 1
                assert info.binsha == stream.binsha
                assert len(info.binsha) == 20
                assert info.type_id == stream.type_id
                assert info.size == stream.size

                # we return fully resolved items, which is implied by the sha centric access
                assert info.type_id not in delta_types

                # try all calls
                assert len(entity.collect_streams(info.binsha))
                oinfo = entity.info(info.binsha)
                assert isinstance(oinfo, OInfo)
                assert oinfo.binsha is not None
                ostream = entity.stream(info.binsha)
                assert isinstance(ostream, OStream)
                assert ostream.binsha is not None

                # verify the stream
                try:
                    assert entity.is_valid_stream(info.binsha, use_crc=True)
                except UnsupportedOperation:
                    pass
                # END ignore version issues
                assert entity.is_valid_stream(info.binsha, use_crc=False)
            # END for each info, stream tuple
            assert count == size

        # END for each entity

        # pack writing - write all packs into one
        # index path can be None
        pack_path1 = tempfile.mktemp('', "pack1", rw_dir)
        pack_path2 = tempfile.mktemp('', "pack2", rw_dir)
        index_path = tempfile.mktemp('', 'index', rw_dir)

        def rewind_streams():
            for obj in pack_objs:
                obj.stream.seek(0)
        # END utility

        for iteration, (ppath, ipath, num_obj) in enumerate(zip((pack_path1, pack_path2),
                                                                (index_path, None),
                                                                (len(pack_objs), None))):
            ifile = open(ipath, 'wb') if ipath else None
            try:
                iwrite = ifile.write if ifile is not None else None

                # make sure we rewind the streams ... we work on the same objects over and over again
                if iteration > 0:
                    rewind_streams()
                # END rewind streams

                with open(ppath, 'wb') as pfile:
                    pack_sha, index_sha = PackEntity.write_pack(pack_objs, pfile.write, iwrite, object_count=num_obj)
            finally:
                # close the index file even if writing failed, it must be
                # flushed to disk before it is verified below anyway
                if ifile is not None:
                    ifile.close()
            # END assure index file is closed
            assert os.path.getsize(ppath) > 100

            # verify pack
            pf = PackFile(ppath)
            assert pf.size() == len(pack_objs)
            assert pf.version() == PackFile.pack_version_default
            assert pf.checksum() == pack_sha
            pf.close()

            # verify index
            if ipath is not None:
                assert os.path.getsize(ipath) > 100
                idx = PackIndexFile(ipath)
                assert idx.version() == PackIndexFile.index_version_default
                assert idx.packfile_checksum() == pack_sha
                assert idx.indexfile_checksum() == index_sha
                assert idx.size() == len(pack_objs)
                idx.close()
            # END verify files exist
        # END for each packpath, indexpath pair

        # verify the packs thoroughly
        rewind_streams()
        entity = PackEntity.create(pack_objs, rw_dir)
        count = 0
        for info in entity.info_iter():
            count += 1
            for use_crc in range(2):
                assert entity.is_valid_stream(info.binsha, use_crc)
            # END for each crc mode
        # END for each info
        assert count == len(pack_objs)
        entity.close()

    def test_pack_64(self):
        """Placeholder for a test of packs with 64 bit offsets"""
        # TODO: hex-edit a pack helping us to verify that we can handle 64 byte offsets
        # of course without really needing such a huge pack
        pytest.skip('not implemented')

View File

@@ -0,0 +1,164 @@
# Copyright (C) 2010, 2011 Sebastian Thiel (byronimo@gmail.com) and contributors
#
# This module is part of GitDB and is released under
# the New BSD License: http://www.opensource.org/licenses/bsd-license.php
"""Test for object db"""
from gitdb.test.lib import (
TestBase,
DummyStream,
make_bytes,
make_object,
fixture_path
)
from gitdb import (
DecompressMemMapReader,
FDCompressedSha1Writer,
LooseObjectDB,
Sha1Writer,
MemoryDB,
IStream,
)
from gitdb.util import hex_to_bin
import zlib
from gitdb.typ import (
str_blob_type
)
import tempfile
import os
from io import BytesIO
class TestStream(TestBase):

    """Test stream classes"""

    # sizes, in bytes, of the data the streams are tested with
    data_sizes = (15, 10000, 1000 * 1024 + 512)

    def _assert_stream_reader(self, stream, cdata, rewind_stream=lambda s: None):
        """Make stream tests - the orig_stream is seekable, allowing it to be
        rewound and reused

        The stream is read in small steps first, then rewound and read at once.

        :param stream: the stream to read from
        :param cdata: the data we expect to read from stream, the contents
        :param rewind_stream: function called to rewind the stream to make it ready
            for reuse"""
        ns = 10
        assert len(cdata) > ns - 1, "Data must be larger than %i, was %i" % (ns, len(cdata))

        # read in small steps
        ss = len(cdata) // ns
        for i in range(ns):
            data = stream.read(ss)
            chunk = cdata[i * ss:(i + 1) * ss]
            assert data == chunk
        # END for each step
        # whatever did not fit into the ns equally sized steps
        rest = stream.read()
        if rest:
            assert rest == cdata[-len(rest):]
        # END handle rest

        if isinstance(stream, DecompressMemMapReader):
            assert len(stream.data()) == stream.compressed_bytes_read()
        # END handle special type

        rewind_stream(stream)

        # read everything
        rdata = stream.read()
        assert rdata == cdata

        if isinstance(stream, DecompressMemMapReader):
            assert len(stream.data()) == stream.compressed_bytes_read()
        # END handle special type

    def test_decompress_reader(self):
        """Read compressed data through DecompressMemMapReader, with and without
        an object header, and verify whether it closes its map on deletion"""
        for close_on_deletion in range(2):
            for with_size in range(2):
                for ds in self.data_sizes:
                    cdata = make_bytes(ds, randomize=False)

                    # zdata = zipped actual data
                    # cdata = original content data

                    # create reader
                    if with_size:
                        # need object data
                        zdata = zlib.compress(make_object(str_blob_type, cdata))
                        typ, size, reader = DecompressMemMapReader.new(zdata, close_on_deletion)
                        assert size == len(cdata)
                        assert typ == str_blob_type

                        # even if we don't set the size, it will be set automatically on first read
                        test_reader = DecompressMemMapReader(zdata, close_on_deletion=False)
                        assert test_reader._s == len(cdata)
                    else:
                        # here we need content data
                        zdata = zlib.compress(cdata)
                        reader = DecompressMemMapReader(zdata, close_on_deletion, len(cdata))
                        assert reader._s == len(cdata)
                    # END get reader

                    self._assert_stream_reader(reader, cdata, lambda r: r.seek(0))

                    # put in a dummy stream for closing
                    dummy = DummyStream()
                    reader._m = dummy

                    assert not dummy.closed
                    # 'reader' has to be the last reference, as the dummy is
                    # expected to be closed, or not, right after the deletion
                    del(reader)
                    assert dummy.closed == close_on_deletion
                # END for each datasize
            # END whether size should be used
        # END whether stream should be closed when deleted

    def test_sha_writer(self):
        """Verify the return value of write, and the shas of Sha1Writer"""
        writer = Sha1Writer()
        assert 2 == writer.write(b"hi")
        assert len(writer.sha(as_hex=1)) == 40
        assert len(writer.sha(as_hex=0)) == 20

        # make sure it does something ;)
        prev_sha = writer.sha()
        writer.write(b"hi again")
        assert writer.sha() != prev_sha

    def test_compressed_writer(self):
        """Write through FDCompressedSha1Writer and compare the file contents
        with the result of zlib.compress"""
        for ds in self.data_sizes:
            fd, path = tempfile.mkstemp()
            ostream = FDCompressedSha1Writer(fd)
            data = make_bytes(ds, randomize=False)

            # for now, just a single write, code doesn't care about chunking
            assert len(data) == ostream.write(data)
            ostream.close()

            # its closed already
            self.assertRaises(OSError, os.close, fd)

            # read everything back, compare to data we zip
            fd = os.open(path, os.O_RDONLY | getattr(os, 'O_BINARY', 0))
            written_data = os.read(fd, os.path.getsize(path))
            assert len(written_data) == os.path.getsize(path)
            os.close(fd)
            assert written_data == zlib.compress(data, 1)   # best speed

            os.remove(path)
        # END for each os

    def test_decompress_reader_special_case(self):
        """Read two loose object fixtures entirely, and verify that storing the
        data again yields the original sha"""
        odb = LooseObjectDB(fixture_path('objects'))
        mdb = MemoryDB()
        for sha in (b'888401851f15db0eed60eb1bc29dec5ddcace911',
                    b'7bb839852ed5e3a069966281bb08d50012fb309b',):
            ostream = odb.stream(hex_to_bin(sha))

            # if there is a bug, we will be missing one byte exactly !
            data = ostream.read()
            assert len(data) == ostream.size

            # Putting it back in should yield nothing new - after all, we have
            dump = mdb.store(IStream(ostream.type, ostream.size, BytesIO(data)))
            assert dump.hexsha == sha
        # end for each loose object sha to test

View File

@@ -0,0 +1,100 @@
# Copyright (C) 2010, 2011 Sebastian Thiel (byronimo@gmail.com) and contributors
#
# This module is part of GitDB and is released under
# the New BSD License: http://www.opensource.org/licenses/bsd-license.php
"""Test for object db"""
import tempfile
import os
from gitdb.test.lib import TestBase
from gitdb.util import (
to_hex_sha,
to_bin_sha,
NULL_HEX_SHA,
LockedFD
)
class TestUtils(TestBase):
    """Tests the sha conversion functions and the LockedFD type"""

    def test_basics(self):
        """Verify the conversion between hex and binary shas"""
        assert to_hex_sha(NULL_HEX_SHA) == NULL_HEX_SHA
        assert len(to_bin_sha(NULL_HEX_SHA)) == 20
        assert to_hex_sha(to_bin_sha(NULL_HEX_SHA)) == NULL_HEX_SHA.encode("ascii")

    def _cmp_contents(self, file_path, data):
        """Fail if the contents of the file at file_path do not match the data

        :param file_path: path to the file to read
        :param data: string which is encoded as ascii for the comparison"""
        # raise if data from file at file_path
        # does not match data string
        with open(file_path, "rb") as fp:
            assert fp.read() == data.encode("ascii")

    def test_lockedfd(self):
        """Verify rollback, commit, reading and concurrent opening of LockedFD,
        checking for the existence of the lock file after each step"""
        my_file = tempfile.mktemp()
        orig_data = "hello"
        new_data = "world"
        with open(my_file, "wb") as my_file_fp:
            my_file_fp.write(orig_data.encode("ascii"))

        try:
            lfd = LockedFD(my_file)
            lockfilepath = lfd._lockfilepath()

            # cannot end before it was started
            self.assertRaises(AssertionError, lfd.rollback)
            self.assertRaises(AssertionError, lfd.commit)

            # open for writing
            assert not os.path.isfile(lockfilepath)
            wfd = lfd.open(write=True)
            assert lfd._fd is wfd
            assert os.path.isfile(lockfilepath)

            # write data and fail
            os.write(wfd, new_data.encode("ascii"))
            lfd.rollback()
            assert lfd._fd is None
            # the rollback must have left the original file untouched
            self._cmp_contents(my_file, orig_data)
            assert not os.path.isfile(lockfilepath)

            # additional call doesn't fail
            lfd.commit()
            lfd.rollback()

            # test reading
            lfd = LockedFD(my_file)
            rfd = lfd.open(write=False)
            assert os.read(rfd, len(orig_data)) == orig_data.encode("ascii")

            assert os.path.isfile(lockfilepath)
            # deletion rolls back
            del(lfd)
            assert not os.path.isfile(lockfilepath)

            # write data - concurrently
            lfd = LockedFD(my_file)
            olfd = LockedFD(my_file)
            assert not os.path.isfile(lockfilepath)
            wfdstream = lfd.open(write=True, stream=True)       # this time as stream
            assert os.path.isfile(lockfilepath)
            # another one fails
            self.assertRaises(IOError, olfd.open)

            wfdstream.write(new_data.encode("ascii"))
            lfd.commit()
            assert not os.path.isfile(lockfilepath)
            # the commit must have replaced the contents of the file
            self._cmp_contents(my_file, new_data)

            # could test automatic _end_writing on destruction
        finally:
            os.remove(my_file)
        # END final cleanup

        # try non-existing file for reading
        # mktemp only produces a name here, the file is never created
        lfd = LockedFD(tempfile.mktemp())
        try:
            lfd.open(write=False)
        except OSError:
            assert not os.path.exists(lfd._lockfilepath())
        else:
            self.fail("expected OSError")
        # END handle exceptions

View File

@@ -0,0 +1,10 @@
# Copyright (C) 2010, 2011 Sebastian Thiel (byronimo@gmail.com) and contributors
#
# This module is part of GitDB and is released under
# the New BSD License: http://www.opensource.org/licenses/bsd-license.php
"""Module containing information about types known to the database"""
# Names of the object types known to the database, each one as a byte string
str_blob_type = b'blob'
str_commit_type = b'commit'
str_tree_type = b'tree'
str_tag_type = b'tag'

View File

@@ -0,0 +1,398 @@
# Copyright (C) 2010, 2011 Sebastian Thiel (byronimo@gmail.com) and contributors
#
# This module is part of GitDB and is released under
# the New BSD License: http://www.opensource.org/licenses/bsd-license.php
import binascii
import os
import mmap
import sys
import time
import errno
from io import BytesIO
from smmap import (
StaticWindowMapManager,
SlidingWindowMapManager,
SlidingWindowMapBuffer
)
# Global memory manager instance, created once when this module is imported.
# sliding_ro_buffer() obtains its cursors from it.
# Use it to free cached (and unused) resources.
mman = SlidingWindowMapManager()
import hashlib
try:
    from struct import unpack_from
except ImportError:
    # struct does not provide unpack_from - emulate it using unpack and a slice
    from struct import unpack, calcsize

    # maps a format string to the amount of bytes it consumes
    __calcsize_cache = dict()

    def unpack_from(fmt, data, offset=0):
        """Unpack the values described by fmt from data, starting at offset

        :param fmt: struct format string
        :param data: sliceable buffer to read from
        :param offset: index of the first byte to unpack
        :return: tuple of unpacked values"""
        size = __calcsize_cache.get(fmt)
        if size is None:
            # first time we see this format, remember its size
            size = __calcsize_cache[fmt] = calcsize(fmt)
        # END handle cache miss
        return unpack(fmt, data[offset:offset + size])
    # END own unpack_from implementation
#{ Aliases

# conversion between hexadecimal and binary representations, straight from binascii
hex_to_bin = binascii.a2b_hex
bin_to_hex = binascii.b2a_hex

# errors
ENOENT = errno.ENOENT

# os shortcuts - module level names for os and os.path functions
exists = os.path.exists
mkdir = os.mkdir
chmod = os.chmod
isdir = os.path.isdir
isfile = os.path.isfile
rename = os.rename
dirname = os.path.dirname
basename = os.path.basename
join = os.path.join
read = os.read
write = os.write
close = os.close
fsync = os.fsync
def _retry(func, *args, **kwargs):
    """Call func with the given arguments, retrying it on Windows if it raises

    Wrapper around functions that are problematic on "Windows". Sometimes
    the OS or someone else has still a handle to the file.

    On win32 the call is attempted up to ten times with a pause of 0.1 seconds
    after each failure, followed by a final attempt whose exception propagates.
    On all other platforms func is called exactly once.

    :return: whatever func returns"""
    if sys.platform != "win32":
        return func(*args, **kwargs)
    # END handle sane platforms

    attempts_left = 10
    while attempts_left:
        attempts_left -= 1
        try:
            return func(*args, **kwargs)
        except Exception:
            # give whoever holds the handle a moment to release it
            time.sleep(0.1)
        # END exception handling
    # END for each attempt

    # last chance - let the error out this time
    return func(*args, **kwargs)
def remove(*args, **kwargs):
    """Remove a file using ``os.remove``, routed through ``_retry``

    :return: the result of the ``os.remove`` call"""
    outcome = _retry(os.remove, *args, **kwargs)
    return outcome
# Backwards compatibility imports
from gitdb.const import (
NULL_BIN_SHA,
NULL_HEX_SHA
)
#} END Aliases
#{ compatibility stuff ...
class _RandomAccessBytesIO:

    """Wrapper to provide required functionality in case memory maps cannot or may
    not be used. This is only really required in python 2.4

    It holds a ``BytesIO`` and forwards every unknown attribute to it, adding
    ``len()`` as well as index and slice access on top of the stream interface."""
    __slots__ = '_sio'

    def __init__(self, buf=b''):
        """:param buf: bytes-like initial content, empty by default.
            The default must be bytes as BytesIO refuses to take a str"""
        self._sio = BytesIO(buf)

    def __getattr__(self, attr):
        # Only called if the normal lookup failed. If that happens for our own
        # slot (instance created without __init__, e.g. when being copied),
        # delegating would look up '_sio' again and recurse endlessly.
        if attr == '_sio':
            raise AttributeError(attr)
        return getattr(self._sio, attr)

    def __len__(self):
        """:return: total amount of bytes held, independent of the stream position"""
        return len(self.getvalue())

    def __getitem__(self, i):
        """:return: item or slice of the complete content. i may be an index or a slice"""
        return self.getvalue()[i]

    def __getslice__(self, start, end):
        # not used by the python 3 slicing protocol, kept for explicit callers
        return self.getvalue()[start:end]
def byte_ord(b):
    """
    Return the integer representation of the byte string. This supports Python
    3 byte arrays as well as standard strings.

    Whatever ``ord`` rejects with a TypeError, like an integer obtained by
    indexing bytes, is handed back unchanged.
    """
    try:
        value = ord(b)
    except TypeError:
        value = b
    # END handle non-characters
    return value
#} END compatibility stuff ...
#{ Routines
def make_sha(source=b''):
    """Create a new sha1 hash object, optionally fed with initial data

    This used to be a python2.4 workaround for the sha/hashlib module fiasco.
    As hashlib is imported unconditionally by this module, the fallback to the
    ``sha`` module could never be reached and was removed.

    **Note** From the dulwich project

    :param source: bytes to initialize the hash with
    :return: sha1 hash object as created by hashlib"""
    return hashlib.sha1(source)
def allocate_memory(size):
    """:return: a file-protocol accessible memory block of the given size

    :param size: amount of bytes to provide. For a size of 0 an empty
        ``_RandomAccessBytesIO`` is returned"""
    if size != 0:
        try:
            # anonymous map, read-write by default
            return mmap.mmap(-1, size)
        except OSError:
            # setup real memory instead
            # this of course may fail if the amount of memory is not available in
            # one chunk - would only be the case in python 2.4, being more likely on
            # 32 bit systems.
            return _RandomAccessBytesIO(b"\0" * size)
        # END handle memory allocation
    # END handle non-empty chunks

    # empty chunks are handled gracefully
    return _RandomAccessBytesIO(b'')
def file_contents_ro(fd, stream=False, allow_mmap=True):
    """:return: read-only contents of the file represented by the file descriptor fd

    :param fd: file descriptor opened for reading
    :param stream: if False, random access is provided, otherwise the stream interface
        is provided.
    :param allow_mmap: if True, its allowed to map the contents into memory, which
        allows large files to be handled and accessed efficiently. The file-descriptor
        will change its position if this is False, or if the file could not be
        mapped and had to be read manually instead"""
    if allow_mmap:
        try:
            # supports stream and random access. A length of 0 maps the whole file
            return mmap.mmap(fd, 0, access=mmap.ACCESS_READ)
        except (OSError, ValueError):
            # mmap signals an empty file with a ValueError, not an OSError. Either
            # way we fall back to reading the descriptor ourselves.
            pass
        # END exception handling
    # END handle mmap

    # read manually. A single os.read may hand out less than requested, so keep
    # reading until we have everything or hit the end of the file
    remaining = os.fstat(fd).st_size
    chunks = []
    while remaining > 0:
        chunk = os.read(fd, remaining)
        if not chunk:
            break
        # END handle premature end of file
        chunks.append(chunk)
        remaining -= len(chunk)
    # END while there is data to read
    contents = b"".join(chunks)

    if stream:
        return _RandomAccessBytesIO(contents)
    return contents
def file_contents_ro_filepath(filepath, stream=False, allow_mmap=True, flags=0):
    """Get the file contents at filepath as fast as possible

    The file is opened read-only (and binary where the platform knows such a
    flag), handed to ``file_contents_ro`` and closed again in any case.

    :return: random access compatible memory of the given filepath
    :param stream: see ``file_contents_ro``
    :param allow_mmap: see ``file_contents_ro``
    :param flags: additional flags to pass to os.open
    :raise OSError: If the file could not be opened

    **Note** for now we don't try to use O_NOATIME directly as the right value needs to be
    shared per database in fact. It only makes a real difference for loose object
    databases anyway, and they use it with the help of the ``flags`` parameter"""
    open_flags = os.O_RDONLY | getattr(os, 'O_BINARY', 0) | flags
    fd = os.open(filepath, open_flags)
    try:
        contents = file_contents_ro(fd, stream, allow_mmap)
    finally:
        close(fd)
    # END assure file is closed
    return contents
def sliding_ro_buffer(filepath, flags=0):
    """
    :return: a buffer compatible object which uses our mapped memory manager internally
        ready to read the whole given filepath
    :param filepath: path of the file to read
    :param flags: passed on to the ``SlidingWindowMapBuffer``"""
    cursor = mman.make_cursor(filepath)
    return SlidingWindowMapBuffer(cursor, flags=flags)
def to_hex_sha(sha):
    """:return: hexified version of sha

    A value of length 40 is handed back unchanged, everything else is
    converted with ``bin_to_hex``"""
    return sha if len(sha) == 40 else bin_to_hex(sha)
def to_bin_sha(sha):
    """:return: binary version of sha

    A value of length 20 is handed back unchanged, everything else is
    converted with ``hex_to_bin``"""
    return sha if len(sha) == 20 else hex_to_bin(sha)
#} END routines
#{ Utilities
class LazyMixin:

    """
    Base class providing an interface to lazily retrieve attribute values upon
    first access. If slots are used, memory will only be reserved once the attribute
    is actually accessed and retrieved the first time. All future accesses will
    return the cached value as stored in the Instance's dict or slot.
    """

    __slots__ = ()

    def __getattr__(self, attr):
        """
        Called for attributes which cannot be found the normal way. The subclass
        gets the chance to create and set the attribute, so the next access is
        served from our dict/slots without coming here again."""
        self._set_cache_(attr)
        # raises AttributeError in case the cache was not created
        return object.__getattribute__(self, attr)

    def _set_cache_(self, attr):
        """
        This method should be overridden in the derived class.
        It should check whether the attribute named by attr can be created
        and cached. Do nothing if you do not know the attribute or call your subclass

        The derived class may create as many additional attributes as it deems
        necessary in case a git command returns more information than represented
        in the single attribute."""
class LockedFD:

    """
    This class facilitates a safe read and write operation to a file on disk.
    If we write to 'file', we obtain a lock file at 'file.lock' and write to
    that instead. If we succeed, the lock file will be renamed to overwrite
    the original file.

    When reading, we obtain a lock file as well, but only to prevent other
    writers from succeeding while we are reading the file.

    This type handles errors correctly in that it will assure a consistent state
    on destruction.

    **note** with this setup, parallel reading is not possible"""
    __slots__ = ("_filepath", '_fd', '_write')

    def __init__(self, filepath):
        """Initialize an instance with the given filepath"""
        self._filepath = filepath
        self._fd = None         # descriptor we hand out, None while not open
        self._write = None      # None until open() was called, True if we write a file

    def __del__(self):
        # nothing to do if the file descriptor is already closed
        if self._fd is None:
            return
        self.rollback()

    def _lockfilepath(self):
        """:return: path of the lock file belonging to our file"""
        return "%s.lock" % self._filepath

    def open(self, write=False, stream=False):
        """
        Open the file descriptor for reading or writing, both in binary mode.

        :param write: if True, the file descriptor will be opened for writing. Other
            wise it will be opened read-only.
        :param stream: if True, the file descriptor will be wrapped into a simple stream
            object which supports only reading or writing
        :return: fd to read from or write to. It is still maintained by this instance
            and must not be closed directly
        :raise IOError: if the lock could not be retrieved
        :raise OSError: If the actual file could not be opened for reading

        **note** must only be called once"""
        if self._write is not None:
            raise AssertionError("Called %s multiple times" % self.open)

        self._write = write

        # try to open the lock file - exclusive creation fails if it exists already
        binary_flag = getattr(os, 'O_BINARY', 0)
        lock_flags = os.O_WRONLY | os.O_CREAT | os.O_EXCL | binary_flag
        try:
            lock_fd = os.open(self._lockfilepath(), lock_flags, 0o600)
            if write:
                # writers put their data into the lock file itself
                self._fd = lock_fd
            else:
                # readers only need the lock file to exist
                os.close(lock_fd)
            # END handle file descriptor
        except OSError as e:
            raise OSError("Lock at %r could not be obtained" % self._lockfilepath()) from e
        # END handle lock retrieval

        # open actual file if required
        if self._fd is None:
            # we could specify exclusive here, as we obtained the lock anyway
            try:
                self._fd = os.open(self._filepath, os.O_RDONLY | binary_flag)
            except BaseException:
                # assure we release our lockfile, whatever went wrong
                remove(self._lockfilepath())
                raise
            # END handle lockfile
        # END open descriptor for reading

        if not stream:
            return self._fd
        # END handle plain descriptor

        # need delayed import
        from gitdb.stream import FDStream
        return FDStream(self._fd)

    def commit(self):
        """When done writing, call this function to commit your changes into the
        actual file.
        The file descriptor will be closed, and the lockfile handled.

        **Note** can be called multiple times"""
        self._end_writing(successful=True)

    def rollback(self):
        """Abort your operation without any changes. The file descriptor will be
        closed, and the lock released.

        **Note** can be called multiple times"""
        self._end_writing(successful=False)

    def _end_writing(self, successful=True):
        """Handle the lock according to the write mode

        :param successful: if True and we were opened for writing, the lock file
            replaces the actual file. In any other case the lock file is removed.
        :raise AssertionError: if ``open`` was not called before"""
        if self._write is None:
            raise AssertionError("Cannot end operation if it wasn't started yet")

        if self._fd is None:
            # finished already, or the lock was never obtained
            return
        # END handle repeated calls

        os.close(self._fd)
        self._fd = None

        lockfile = self._lockfilepath()
        if not (self._write and successful):
            # just delete the file so far, we failed
            remove(lockfile)
            return
        # END handle rollback and reading

        # on windows, rename does not silently overwrite the existing one
        if sys.platform == "win32" and isfile(self._filepath):
            remove(self._filepath)
        # END win32 special handling

        os.rename(lockfile, self._filepath)

        # assure others can at least read the file - the tmpfile left it at rw--
        # We may also write that file, on windows that boils down to a remove-
        # protection as well
        chmod(self._filepath, 0o644)
#} END utilities

View File

@@ -0,0 +1,18 @@
def force_bytes(data, encoding="utf-8"):
    """Encode data to bytes if it is a str

    :param data: value to convert
    :param encoding: encoding used to encode a str
    :return: the encoded bytes for a str. Bytes, as well as values of any other
        type, are returned unchanged"""
    if isinstance(data, str) and not isinstance(data, bytes):
        return data.encode(encoding)
    return data
def force_text(data, encoding="utf-8"):
    """Decode data to str unless it is one already

    :param data: value to convert
    :param encoding: encoding used to decode the data
    :return: data itself if it is a str, otherwise the decoded text. Values which
        are neither str nor bytes are passed to ``str(data, encoding)``"""
    if isinstance(data, str):
        text = data
    elif isinstance(data, bytes):
        text = data.decode(encoding)
    else:
        text = str(data, encoding)
    return text