2012-02-05 18:17:53 +00:00
|
|
|
#!/usr/bin/python
|
|
|
|
# __________ __ ___.
|
|
|
|
# Open \______ \ ____ ____ | | _\_ |__ _______ ___
|
|
|
|
# Source | _// _ \_/ ___\| |/ /| __ \ / _ \ \/ /
|
|
|
|
# Jukebox | | ( <_> ) \___| < | \_\ ( <_> > < <
|
|
|
|
# Firmware |____|_ /\____/ \___ >__|_ \|___ /\____/__/\_ \
|
|
|
|
# \/ \/ \/ \/ \/
|
|
|
|
#
|
|
|
|
# Copyright (c) 2012 Dominik Riebeling
|
|
|
|
#
|
|
|
|
# All files in this archive are subject to the GNU General Public License.
|
|
|
|
# See the file COPYING in the source tree root for full license agreement.
|
|
|
|
#
|
|
|
|
# This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
|
|
|
|
# KIND, either express or implied.
|
|
|
|
#
|
|
|
|
|
|
|
|
'''Scrape files from a git repository.
|
|
|
|
|
|
|
|
This module provides functions to get a subset of files from a git repository.
|
|
|
|
The files to retrieve can be specified, and the git tree to work on can be
|
2012-05-14 21:01:19 +00:00
|
|
|
specified. That way arbitrary trees can be retrieved (like a subset of files
|
2012-02-05 18:17:53 +00:00
|
|
|
for a given tag).
|
|
|
|
|
|
|
|
Retrieved files can be packaged into a bzip2 compressed tarball or stored in a
|
|
|
|
given folder for processing afterwards.
|
|
|
|
|
|
|
|
Calls git commands directly for maximum compatibility.
|
|
|
|
'''
|
|
|
|
|
|
|
|
import re
|
|
|
|
import subprocess
|
|
|
|
import os
|
|
|
|
import tarfile
|
|
|
|
import tempfile
|
|
|
|
import shutil
|
|
|
|
|
|
|
|
|
|
|
|
def get_refs(repo):
    '''Get dict matching refs to hashes from repository pointed to by repo.
    Runs "git show-ref --abbrev --head" in the repository and parses its
    output, which lists one "<hash> <ref>" pair per line.
    @param repo Path to repository root.
    @return Dict mapping each ref name to its (abbreviated) hash. The dict is
            empty if git wrote anything to stderr.
    '''
    print("Getting list of refs")
    output = subprocess.Popen(
        ["git", "show-ref", "--abbrev", "--head"],
        stdout=subprocess.PIPE, stderr=subprocess.PIPE, cwd=repo)
    stdout, stderr = output.communicate()
    refs = dict()

    if len(stderr) > 0:
        print("An error occured!\n")
        # Decode so the message is readable instead of a b'...' repr.
        print(stderr.decode("utf-8", "replace"))
        return refs

    # Only stdout carries refs; stderr is known to be empty at this point.
    for line in stdout.decode().splitlines():
        match = re.match(r'([a-f0-9]+)\s+(\S+)', line)
        if match:
            # ref is the key, hash its value.
            refs[match.group(2)] = match.group(1)

    return refs
|
|
|
|
|
|
|
|
|
2019-11-17 10:55:46 +00:00
|
|
|
def get_lstree(repo, start, filterlist=None):
    '''Get recursive list of tree objects for a given tree.
    Runs "git ls-tree -r" on the given tree and parses each output line of
    the form "<mode> <type> <hash> <path>".
    @param repo Path to repository root.
    @param start Hash identifying the tree.
    @param filterlist List of paths to retrieve object hashes for. A path is
                      retrieved if it starts with any entry of this list.
                      An empty list will retrieve all paths.
    @return Dict mapping filename to blob hash. Empty if git wrote anything
            to stderr or if the same filename was listed twice.
    '''
    if filterlist is None:
        filterlist = list()
    output = subprocess.Popen(
        ["git", "ls-tree", "-r", start],
        stdout=subprocess.PIPE, stderr=subprocess.PIPE, cwd=repo)
    stdout, stderr = output.communicate()
    objects = dict()

    if len(stderr) > 0:
        print("An error occured!\n")
        # Decode so the message is readable instead of a b'...' repr.
        print(stderr.decode("utf-8", "replace"))
        return objects

    prefixes = tuple(filterlist)
    pattern = re.compile(r'([0-9]+)\s+([a-z]+)\s+([0-9a-f]+)\s+(.*)')
    # Decode once and stay in str, so that the keys stored in the dict and
    # the keys tested for duplicates have the same type.
    for line in stdout.decode().split('\n'):
        match = pattern.search(line)
        if match is None:
            continue
        blobhash = match.group(3)
        filename = match.group(4)
        # filter
        if prefixes and not filename.startswith(prefixes):
            continue
        # If two files have the same content they have the same hash, so
        # the filename has to be used as key.
        if filename in objects:
            print("FATAL: key already exists in dict!")
            return {}
        objects[filename] = blobhash
    return objects
|
|
|
|
|
|
|
|
|
2012-04-29 09:38:23 +00:00
|
|
|
def get_file_timestamp(repo, tree, filename):
    '''Get timestamp for a file.
    Runs "git log --format=%ai -n 1" for the file, starting at the given
    tree. Anything git writes to stderr is ignored.
    @param repo Path to repository root.
    @param tree Hash of tree to use.
    @param filename Filename in tree
    @return Timestamp as string, with trailing whitespace removed. This is
            whatever git printed on stdout, so it is empty if git printed
            nothing.
    '''
    output = subprocess.Popen(
        # "--" separates the revision from the path. Without it git rejects
        # the call as ambiguous when the path does not exist in the working
        # tree or has the same name as a ref.
        ["git", "log", "--format=%ai", "-n", "1", tree, "--", filename],
        stdout=subprocess.PIPE, stderr=subprocess.PIPE, cwd=repo)
    cmdout = output.communicate()

    return cmdout[0].decode().rstrip()
|
|
|
|
|
|
|
|
|
2012-02-05 18:17:53 +00:00
|
|
|
def get_object(repo, blob, destfile):
    '''Get an identified object from the repository.
    Runs "git cat-file -p" for the object and writes its output to destfile,
    creating the destination folder if needed.
    @param repo Path to repository root.
    @param blob hash for blob to retrieve.
    @param destfile filename for blob output.
    @return True if file was successfully written, False if git wrote
            anything to stderr (nothing is written in that case).
    '''
    output = subprocess.Popen(
        ["git", "cat-file", "-p", blob],
        stdout=subprocess.PIPE, stderr=subprocess.PIPE, cwd=repo)
    stdout, stderr = output.communicate()
    if len(stderr) > 0:
        print("An error occured!\n")
        # Decode so the message is readable instead of a b'...' repr.
        print(stderr.decode("utf-8", "replace"))
        return False
    # make sure output path exists. A destfile without a folder part yields
    # an empty dirname, which must not be passed to makedirs.
    destdir = os.path.dirname(destfile)
    if destdir and not os.path.exists(destdir):
        os.makedirs(destdir)
    # Binary mode: the object content is written exactly as git emitted it.
    with open(destfile, 'wb') as f:
        f.write(stdout)
    return True
|
|
|
|
|
|
|
|
|
|
|
|
def describe_treehash(repo, treehash):
    '''Retrieve output of git-describe for a given hash.
    @param repo Path to repository root.
    @param treehash Hash identifying the tree / commit to describe.
    @return Description as printed by "git describe", with trailing
            whitespace removed. On success this is the undecoded output
            (bytes); if git wrote anything to stderr an empty str is
            returned instead.
    '''
    # NOTE(review): the success path returns bytes while the error path
    # returns str. The type is kept as is because callers are not visible
    # here -- confirm whether they expect bytes before decoding.
    output = subprocess.Popen(
        ["git", "describe", treehash],
        stdout=subprocess.PIPE, stderr=subprocess.PIPE, cwd=repo)
    stdout, stderr = output.communicate()
    if len(stderr) > 0:
        print("An error occured!\n")
        # Decode so the message is readable instead of a b'...' repr.
        print(stderr.decode("utf-8", "replace"))
        return ""
    return stdout.rstrip()
|
|
|
|
|
|
|
|
|
2019-11-17 10:55:46 +00:00
|
|
|
def scrape_files(repo, treehash, filelist, dest=None, timestamp_files=None):
    '''Scrape list of files from repository.
    @param repo Path to repository root.
    @param treehash Hash identifying the tree.
    @param filelist List of files to get from repository.
    @param dest Destination path for files. Files will get retrieved with full
                path from the repository, and the folder structure will get
                created below dest as necessary. If not given a new
                temporary folder is created and used.
    @param timestamp_files List of files to also get the last modified date.
                           A file is included if its path starts with any
                           entry of this list.
                           WARNING: this is SLOW!
    @return List of two items: destination path, filename:timestamp dict.
    '''
    print("Scraping files from repository")

    if timestamp_files is None:
        timestamp_files = list()
    if dest is None:
        dest = tempfile.mkdtemp()
    treeobjects = get_lstree(repo, treehash, filelist)
    stamp_prefixes = tuple(timestamp_files)
    timestamps = {}
    for obj, blob in treeobjects.items():
        # The result of get_object is not checked, so a file that failed to
        # write does not stop the remaining files from being retrieved.
        get_object(repo, blob, os.path.join(dest, obj))
        # Retrieving a timestamp is slow, so do it only once per file even
        # if the file matches more than one entry of timestamp_files.
        if stamp_prefixes and obj.startswith(stamp_prefixes):
            timestamps[obj] = get_file_timestamp(repo, treehash, obj)

    return [dest, timestamps]
|
2012-02-05 18:17:53 +00:00
|
|
|
|
|
|
|
|
2019-11-17 10:55:46 +00:00
|
|
|
def archive_files(repo, treehash, filelist, basename, tmpfolder=None,
                  archive="tbz"):
    '''Archive list of files into tarball.
    @param repo Path to repository root.
    @param treehash Hash identifying the tree.
    @param filelist List of files to archive. All files in the archive if left
                    empty.
    @param basename Basename (including path) of output file. Will get used as
                    basename inside of the archive as well (i.e. no tarbomb).
                    If empty the files are only scraped and no archive is
                    created.
    @param tmpfolder Folder to put intermediate files in. If no folder is given
                     a temporary one will get used.
    @param archive Type of archive to create. Supported values are "tbz" and
                   "7z". The latter requires the 7z binary available in the
                   system's path.
    @return Output filename. Empty string if no archive was created, i.e.
            basename is empty or the archive type is not supported.
    '''

    if tmpfolder is None:
        temp_remove = True
        tmpfolder = tempfile.mkdtemp()
    else:
        temp_remove = False
    workfolder = scrape_files(
        repo, treehash, filelist, os.path.join(tmpfolder, basename))[0]
    if basename == "":
        # A temporary folder created here is never reported to the caller,
        # so remove it instead of leaking it. A folder passed in by the
        # caller is left untouched.
        if temp_remove:
            shutil.rmtree(tmpfolder)
        return ""
    print("Archiving files from repository")
    if archive == "7z":
        outfile = basename + ".7z"
        # 7z runs inside tmpfolder, so the archive is given as absolute path
        # to create it relative to the current working directory.
        output = subprocess.Popen(
            ["7z", "a", os.path.join(os.getcwd(), outfile), basename],
            cwd=tmpfolder, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        output.communicate()
    elif archive == "tbz":
        outfile = basename + ".tar.bz2"
        tf = tarfile.open(outfile, "w:bz2")
        try:
            tf.add(workfolder, basename)
        finally:
            # Close the archive even if adding the files fails.
            tf.close()
    else:
        print("Files not archived")
        # No archive was written; report that the same way as for an empty
        # basename instead of failing on an unset variable.
        outfile = ""
    if tmpfolder != workfolder:
        shutil.rmtree(workfolder)
    if temp_remove:
        shutil.rmtree(tmpfolder)
    return outfile
|