Skip to content

Commit ae5a69f

Browse files
committed
commit: redesigned revlist and commit parsing, commits are always retrieved from their object information directly. This is faster, and resolves issues with the rev-list format and empty commit messages
Adjusted many tests to go with the changes, as they were still mocked. The mock was removed if necessary and replaced by code that actually executes
1 parent 4e1c89e commit ae5a69f

File tree

5 files changed

+217
-265
lines changed

5 files changed

+217
-265
lines changed

lib/git/objects/commit.py

+40-58
Original file line numberDiff line numberDiff line change
@@ -106,13 +106,12 @@ def _get_intermediate_items(cls, commit):
106106
return commit.parents
107107

108108
def _set_cache_(self, attr):
109-
"""
110-
Called by LazyMixin superclass when the given uninitialized member needs
109+
""" Called by LazyMixin superclass when the given uninitialized member needs
111110
to be set.
112-
We set all values at once.
113-
"""
111+
We set all values at once. """
114112
if attr in Commit.__slots__:
115113
# read the data in a chunk, its faster - then provide a file wrapper
114+
# Could use self.data, but lets try to get it with less calls
116115
hexsha, typename, size, data = self.repo.git.get_object_data(self)
117116
self._deserialize(StringIO(data))
118117
else:
@@ -181,16 +180,16 @@ def iter_items(cls, repo, rev, paths='', **kwargs):
181180
Returns
182181
iterator yielding Commit items
183182
"""
184-
options = {'pretty': 'raw', 'as_process' : True }
185-
options.update(kwargs)
186-
183+
if 'pretty' in kwargs:
184+
raise ValueError("--pretty cannot be used as parsing expects single sha's only")
185+
# END handle pretty
187186
args = list()
188187
if paths:
189188
args.extend(('--', paths))
190189
# END if paths
191190

192-
proc = repo.git.rev_list(rev, args, **options)
193-
return cls._iter_from_process_or_stream(repo, proc, True)
191+
proc = repo.git.rev_list(rev, args, as_process=True, **kwargs)
192+
return cls._iter_from_process_or_stream(repo, proc)
194193

195194
def iter_parents(self, paths='', **kwargs):
196195
"""
@@ -235,35 +234,30 @@ def stats(self):
235234
return stats.Stats._list_from_string(self.repo, text)
236235

237236
@classmethod
238-
def _iter_from_process_or_stream(cls, repo, proc_or_stream, from_rev_list):
239-
"""
240-
Parse out commit information into a list of Commit objects
241-
242-
``repo``
243-
is the Repo
244-
245-
``proc``
246-
git-rev-list process instance (raw format)
237+
def _iter_from_process_or_stream(cls, repo, proc_or_stream):
238+
"""Parse out commit information into a list of Commit objects
239+
We expect one-line per commit, and parse the actual commit information directly
240+
from our lighting fast object database
247241
248-
``from_rev_list``
249-
If True, the stream was created by rev-list in which case we parse
250-
the message differently
251-
Returns
252-
iterator returning Commit objects
253-
"""
242+
:param proc: git-rev-list process instance - one sha per line
243+
:return: iterator returning Commit objects"""
254244
stream = proc_or_stream
255245
if not hasattr(stream,'readline'):
256246
stream = proc_or_stream.stdout
257247

248+
readline = stream.readline
258249
while True:
259-
line = stream.readline()
250+
line = readline()
260251
if not line:
261252
break
262-
commit_tokens = line.split()
263-
id = commit_tokens[1]
264-
assert commit_tokens[0] == "commit"
253+
sha = line.strip()
254+
if len(sha) > 40:
255+
# split additional information, as returned by bisect for instance
256+
sha, rest = line.split(None, 1)
257+
# END handle extra info
265258

266-
yield Commit(repo, id)._deserialize(stream, from_rev_list)
259+
assert len(sha) == 40, "Invalid line: %s" % sha
260+
yield Commit(repo, sha)
267261
# END for each line in stream
268262

269263

@@ -386,15 +380,16 @@ def _serialize(self, stream):
386380
# for now, this is very inefficient and in fact shouldn't be used like this
387381
return super(Commit, self)._serialize(stream)
388382

389-
def _deserialize(self, stream, from_rev_list=False):
383+
def _deserialize(self, stream):
390384
""":param from_rev_list: if true, the stream format is coming from the rev-list command
391385
Otherwise it is assumed to be a plain data stream from our object"""
392-
self.tree = Tree(self.repo, stream.readline().split()[1], 0, '')
386+
readline = stream.readline
387+
self.tree = Tree(self.repo, readline().split()[1], 0, '')
393388

394389
self.parents = list()
395390
next_line = None
396391
while True:
397-
parent_line = stream.readline()
392+
parent_line = readline()
398393
if not parent_line.startswith('parent'):
399394
next_line = parent_line
400395
break
@@ -404,37 +399,24 @@ def _deserialize(self, stream, from_rev_list=False):
404399
self.parents = tuple(self.parents)
405400

406401
self.author, self.authored_date, self.author_tz_offset = utils.parse_actor_and_date(next_line)
407-
self.committer, self.committed_date, self.committer_tz_offset = utils.parse_actor_and_date(stream.readline())
402+
self.committer, self.committed_date, self.committer_tz_offset = utils.parse_actor_and_date(readline())
408403

409404

410-
# empty line
405+
# now we can have the encoding line, or an empty line followed by the optional
406+
# message.
411407
self.encoding = self.default_encoding
412-
enc = stream.readline()
413-
enc.strip()
408+
# read encoding or empty line to separate message
409+
enc = readline()
410+
enc = enc.strip()
414411
if enc:
415412
self.encoding = enc[enc.find(' ')+1:]
416-
# END parse encoding
417-
418-
message_lines = list()
419-
if from_rev_list:
420-
while True:
421-
msg_line = stream.readline()
422-
if not msg_line.startswith(' '):
423-
# and forget about this empty marker
424-
# cut the last newline to get rid of the artificial newline added
425-
# by rev-list command. Lets hope its just linux style \n
426-
message_lines[-1] = message_lines[-1][:-1]
427-
break
428-
# END abort message reading
429-
# strip leading 4 spaces
430-
message_lines.append(msg_line[4:])
431-
# END while there are message lines
432-
self.message = ''.join(message_lines)
433-
else:
434-
# a stream from our data simply gives us the plain message
435-
# The end of our message stream is marked with a newline that we strip
436-
self.message = stream.read()[:-1]
437-
# END message parsing
413+
# now comes the message separator
414+
readline()
415+
# END handle encoding
416+
417+
# a stream from our data simply gives us the plain message
418+
# The end of our message stream is marked with a newline that we strip
419+
self.message = stream.read()[:-1]
438420
return self
439421

440422
#} END serializable implementation

test/fixtures/rev_list

+3-24
Original file line numberDiff line numberDiff line change
@@ -1,24 +1,3 @@
1-
commit 4c8124ffcf4039d292442eeccabdeca5af5c5017
2-
tree 672eca9b7f9e09c22dcb128c283e8c3c8d7697a4
3-
parent 634396b2f541a9f2d58b00be1a07f0c358b999b3
4-
author Tom Preston-Werner <[email protected]> 1191999972 -0700
5-
committer Tom Preston-Werner <[email protected]> 1191999972 -0700
6-
7-
implement Grit#heads
8-
9-
commit 634396b2f541a9f2d58b00be1a07f0c358b999b3
10-
tree b35b4bf642d667fdd613eebcfe4e17efd420fb8a
11-
author Tom Preston-Werner <[email protected]> 1191997100 -0700
12-
committer Tom Preston-Werner <[email protected]> 1191997100 -0700
13-
14-
initial grit setup
15-
16-
commit ab25fd8483882c3bda8a458ad2965d2248654335
17-
tree c20b5ec543bde1e43a931449b196052c06ed8acc
18-
parent 6e64c55896aabb9a7d8e9f8f296f426d21a78c2c
19-
parent 7f874954efb9ba35210445be456c74e037ba6af2
20-
author Tom Preston-Werner <[email protected]> 1182645538 -0700
21-
committer Tom Preston-Werner <[email protected]> 1182645538 -0700
22-
23-
Merge branch 'site'
24-
Some other stuff
1+
4c8124ffcf4039d292442eeccabdeca5af5c5017
2+
634396b2f541a9f2d58b00be1a07f0c358b999b3
3+
ab25fd8483882c3bda8a458ad2965d2248654335

test/git/performance/test_commit.py

+4-4
Original file line numberDiff line numberDiff line change
@@ -4,12 +4,12 @@
44
# This module is part of GitPython and is released under
55
# the BSD License: http://www.opensource.org/licenses/bsd-license.php
66

7-
from test.testlib import *
7+
from lib import *
88
from git import *
99
from time import time
1010
import sys
1111

12-
class TestPerformance(TestBase):
12+
class TestPerformance(TestBigRepoReadOnly):
1313

1414
# ref with about 100 commits in its history
1515
ref_100 = '0.1.6'
@@ -48,7 +48,7 @@ def test_commit_traversal(self):
4848
# bound to cat-file parsing performance
4949
nc = 0
5050
st = time()
51-
for c in self.rorepo.commit(self.ref_100).traverse(branch_first=False):
51+
for c in self.gitrepo.commit(self.head_sha_2k).traverse(branch_first=False):
5252
nc += 1
5353
self._query_commit_info(c)
5454
# END for each traversed commit
@@ -59,7 +59,7 @@ def test_commit_iteration(self):
5959
# bound to stream parsing performance
6060
nc = 0
6161
st = time()
62-
for c in Commit.iter_items(self.rorepo, self.ref_100):
62+
for c in Commit.iter_items(self.gitrepo, self.head_sha_2k):
6363
nc += 1
6464
self._query_commit_info(c)
6565
# END for each traversed commit

0 commit comments

Comments
 (0)