Skip to content

Commit 7242876

Browse files
committed
WIP: Support converting repos to Git LFS
Git LFS allows users to commit new files to the LFS store, but replacing _old_ files requires rewriting history, which is something the BFG is pretty good at. This rough cut allows replacing blobs with pointer files throughout repo history. Some caveats with this initial implementation: * the BFG cleans concurrently, so files may unnecessarily be hashed more than once * the working directory isn't updated * specifying `-fi *.png` should be unnecessary; it should use gitattributes * the need for `--no-blob-protection` is a hangover from normal BFG behaviour Example invocation: ``` $ git clone https://github.com/guardian/membership-frontend.git $ cd membership-frontend $ java -jar bfg.jar --convert-to-git-lfs -fi *.png --no-blob-protection ... $ ls .git/lfs/objects/ | head -2 0145f7c304ef33a43cc946e0a57b2213d24dcaf8462f3d3b332407a8b258369c 07010d5ddea536da56ebdbbb28386921c94abd476046a245b35cd47e8eb6e426 $ git reset --hard $ cat frontend/assets/images/favicons/152x152.png version https://git-lfs.github.com/spec/v1 oid sha256:0145f7c304ef33a43cc946e0a57b2213d24dcaf8462f3d3b332407a8b258369c size 1935 $ ``` https://git-lfs.github.com/ https://github.com/github/git-lfs/blob/5eb9bb01/docs/spec.md#the-pointer
1 parent b9949fe commit 7242876

File tree

4 files changed

+117
-3
lines changed

4 files changed

+117
-3
lines changed
Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,88 @@
1+
/*
2+
* Copyright (c) 2015 Roberto Tyley
3+
*
4+
* This file is part of 'BFG Repo-Cleaner' - a tool for removing large
5+
* or troublesome blobs from Git repositories.
6+
*
7+
* BFG Repo-Cleaner is free software: you can redistribute it and/or modify
8+
* it under the terms of the GNU General Public License as published by
9+
* the Free Software Foundation, either version 3 of the License, or
10+
* (at your option) any later version.
11+
*
12+
* BFG Repo-Cleaner is distributed in the hope that it will be useful,
13+
* but WITHOUT ANY WARRANTY; without even the implied warranty of
14+
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15+
* GNU General Public License for more details.
16+
*
17+
* You should have received a copy of the GNU General Public License
18+
* along with this program. If not, see http://www.gnu.org/licenses/ .
19+
*/
20+
21+
package com.madgag.git.bfg.cleaner
22+
23+
import java.nio.charset.Charset
24+
import java.security.{DigestInputStream, MessageDigest}
25+
26+
import com.google.common.io.ByteStreams
27+
import com.madgag.git.ThreadLocalObjectDatabaseResources
28+
import com.madgag.git.bfg.model.{FileName, TreeBlobEntry}
29+
import org.apache.commons.codec.binary.Hex.encodeHexString
30+
import org.eclipse.jgit.lib.Constants.OBJ_BLOB
31+
import org.eclipse.jgit.lib.ObjectLoader
32+
33+
import scala.util.Try
34+
import scalax.file.Path
35+
import scalax.file.Path.createTempFile
36+
import scalax.io.Resource
37+
38+
/**
 * Tree-blob cleaner that swaps the content of selected files for Git LFS
 * pointer files, copying the original content into the LFS objects directory.
 *
 * Implementors supply the object-database resources, the predicate choosing
 * which file names are converted, and the directory receiving the LFS objects.
 */
trait LfsBlobConverter extends TreeBlobModifier {

  val threadLocalObjectDBResources: ThreadLocalObjectDatabaseResources

  /** Predicate selecting, by file name, the entries to convert. */
  val lfsSuitableFiles: (FileName => Boolean)

  /** Charset used to encode the text of the pointer file. */
  val charset: Charset = Charset.forName("UTF-8")

  /** Directory under which LFS objects are stored, one file per SHA-256 hex digest. */
  val lfsObjectsDir: Path

  /**
   * Returns the mode and blob id to use for `entry`.
   *
   * When the entry's file name satisfies `lfsSuitableFiles` and its content was
   * stored by `buildLfsFileFrom`, the returned id is that of a newly inserted
   * pointer blob; otherwise the entry's original id is returned unchanged.
   */
  override def fix(entry: TreeBlobEntry) = {
    val oid = (for {
      _ <- Some(entry.filename) filter lfsSuitableFiles
      loader = threadLocalObjectDBResources.reader().open(entry.objectId)
      (shaHex, lfsPath) <- buildLfsFileFrom(loader)
    } yield {
      // Pointer text: version line, sha256 oid line, size line.
      val pointer =
        s"""|version https://git-lfs.github.com/spec/v1
            |oid sha256:$shaHex
            |size ${loader.getSize}
            |""".stripMargin

      threadLocalObjectDBResources.inserter().insert(OBJ_BLOB, pointer.getBytes(charset))
    }).getOrElse(entry.objectId)

    (entry.mode, oid)
  }

  /**
   * Copies the content behind `loader` into `lfsObjectsDir`, naming the file
   * after the SHA-256 hex digest of that content.
   *
   * @param loader loader for the blob content to store
   * @return the hex digest and the path of the stored file, or `None` when the
   *         file could not be put in place and no file of the expected size
   *         exists at the target path
   */
  def buildLfsFileFrom(loader: ObjectLoader): Option[(String, Path)] = {
    val tmpFile = createTempFile()

    try {
      val digest = MessageDigest.getInstance("SHA-256")

      // Hash the content while streaming it into the temp file.
      for {
        inStream <- Resource.fromInputStream(new DigestInputStream(loader.openStream(), digest))
        outStream <- tmpFile.outputStream()
      } ByteStreams.copy(inStream, outStream)

      val shaHex = encodeHexString(digest.digest())

      val lfsPath = lfsObjectsDir / shaHex

      // A failed move is tolerated only if a file of the expected size is
      // already at the target (presumably written concurrently by another
      // cleaning thread); any other failure stays a Failure so the caller
      // keeps the original blob instead of pointing at a missing LFS object.
      val ensureLfsFile = Try(if (!lfsPath.exists) tmpFile moveTo lfsPath).recover {
        case _ if lfsPath.size.contains(loader.getSize) => ()
      }

      for (_ <- ensureLfsFile.toOption) yield shaHex -> lfsPath
    } finally {
      // Best-effort cleanup, also when hashing/copying threw.
      Try(tmpFile.delete(force = true))
    }
  }
}

bfg/src/main/scala/com/madgag/git/bfg/cli/CLIConfig.scala

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,9 @@ object CLIConfig {
7474
fileMatcher("delete-folders").text("delete folders with the specified names (eg '.svn', '*-tmp' - matches on folder name, not path within repo)").action {
7575
(v, c) => c.copy(deleteFolders = Some(v))
7676
}
77+
opt[Unit]("convert-to-git-lfs").text("experimental support for Git LFS, use with '-fi' to specify files").hidden().action {
78+
(_, c) => c.copy(lfsConversion = true)
79+
}
7780
opt[File]("replace-text").abbr("rt").valueName("<expressions-file>").text("filter content of files, replacing matched text. Match expressions should be listed in the file, one expression per line - " +
7881
"by default, each expression is treated as a literal, but 'regex:' & 'glob:' prefixes are supported, with '==>' to specify a replacement " +
7982
"string other than the default of '***REMOVED***'.").action {
@@ -129,6 +132,7 @@ case class CLIConfig(stripBiggestBlobs: Option[Int] = None,
129132
filterSizeThreshold: Int = BlobTextModifier.DefaultSizeThreshold,
130133
textReplacementExpressions: Traversable[String] = List.empty,
131134
stripBlobsWithIds: Option[Set[ObjectId]] = None,
135+
lfsConversion: Boolean = false,
132136
strictObjectChecking: Boolean = false,
133137
sensitiveData: Option[Boolean] = None,
134138
massiveNonFileObjects: Option[Int] = None,
@@ -172,6 +176,16 @@ case class CLIConfig(stripBiggestBlobs: Option[Int] = None,
172176
}
173177
}
174178

179+
/**
 * The Git LFS converter, present only when `lfsConversion` is enabled.
 *
 * The converter stores LFS objects under `lfs/objects` inside the repository
 * directory and converts the files selected by `filterContentPredicate`.
 */
lazy val lfsBlobConverter: Option[LfsBlobConverter] =
  if (!lfsConversion) None
  else {
    val converter = new LfsBlobConverter {
      val lfsObjectsDir = repo.getDirectory / "lfs" / "objects"

      val lfsSuitableFiles = filterContentPredicate

      val threadLocalObjectDBResources = repo.getObjectDatabase.threadLocalResources
    }
    Some(converter)
  }
188+
175189
lazy val privateDataRemoval = sensitiveData.getOrElse(Seq(fileDeletion, folderDeletion, blobTextModifier).flatten.nonEmpty)
176190

177191
lazy val objectIdSubstitutor = if (privateDataRemoval) ObjectIdSubstitutor.OldIdsPrivate else ObjectIdSubstitutor.OldIdsPublic
@@ -209,7 +223,7 @@ case class CLIConfig(stripBiggestBlobs: Option[Int] = None,
209223
}
210224
}
211225

212-
Seq(blobsByIdRemover, blobRemover, fileDeletion, blobTextModifier).flatten
226+
Seq(blobsByIdRemover, blobRemover, fileDeletion, blobTextModifier, lfsBlobConverter).flatten
213227
}
214228

215229
lazy val definesNoWork = treeBlobCleaners.isEmpty && folderDeletion.isEmpty && treeEntryListCleaners.isEmpty
Binary file not shown.

bfg/src/test/scala/com/madgag/git/bfg/cli/MainSpec.scala

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,10 +20,13 @@
2020

2121
package com.madgag.git.bfg.cli
2222

23+
import com.madgag.git._
24+
import com.madgag.git.bfg.cli.test.unpackedRepo
2325
import org.specs2.mutable._
26+
27+
import scalax.file.ImplicitConversions._
2428
import scalax.file.Path
25-
import com.madgag.git._
26-
import bfg.cli.test.unpackedRepo
29+
2730

2831
class MainSpec extends Specification {
2932

@@ -52,6 +55,15 @@ class MainSpec extends Specification {
5255
}
5356
}
5457

58+
"convert big blobs to the Git LFS format" in new unpackedRepo("/sample-repos/repoWithBigBlobs.git.zip") {
59+
ensureRemovalOfBadEggs(packedBlobsOfSize(11238), contain(exactly(abbrId("596c")))) {
60+
run("--convert-to-git-lfs --filter-content-including *.png --no-blob-protection")
61+
}
62+
val lfsFile = repo.getDirectory / "lfs" / "objects" / "e0ebd49837a1cced34b9e7d3ff2fa68a8100df8f158f165ce139e366a941ba6e"
63+
64+
lfsFile.size must beSome(11238)
65+
}
66+
5567
"remove bad folder named '.git'" in new unpackedRepo("/sample-repos/badRepoContainingDotGitFolder.git.zip") {
5668
ensureRemovalOf(commitHistory(haveFolder(".git").atLeastOnce)) {
5769
run("--delete-folders .git --no-blob-protection")

0 commit comments

Comments
 (0)