TextHashFunctions.java

  1. /*
  2.  * Copyright (C) 2010, Google Inc. and others
  3.  *
  4.  * This program and the accompanying materials are made available under the
  5.  * terms of the Eclipse Distribution License v. 1.0 which is available at
  6.  * https://www.eclipse.org/org/documents/edl-v10.php.
  7.  *
  8.  * SPDX-License-Identifier: BSD-3-Clause
  9.  */

  10. package org.eclipse.jgit.pgm.debug;

  11. import static java.lang.Integer.valueOf;
  12. import static java.lang.Long.valueOf;

  13. import java.io.File;
  14. import java.lang.reflect.Field;
  15. import java.security.MessageDigest;
  16. import java.util.ArrayList;
  17. import java.util.Arrays;
  18. import java.util.HashSet;
  19. import java.util.List;

  20. import org.eclipse.jgit.diff.RawText;
  21. import org.eclipse.jgit.diff.RawTextComparator;
  22. import org.eclipse.jgit.errors.LargeObjectException;
  23. import org.eclipse.jgit.lib.Constants;
  24. import org.eclipse.jgit.lib.FileMode;
  25. import org.eclipse.jgit.lib.MutableObjectId;
  26. import org.eclipse.jgit.lib.ObjectReader;
  27. import org.eclipse.jgit.lib.Repository;
  28. import org.eclipse.jgit.lib.RepositoryBuilder;
  29. import org.eclipse.jgit.lib.RepositoryCache;
  30. import org.eclipse.jgit.pgm.Command;
  31. import org.eclipse.jgit.pgm.TextBuiltin;
  32. import org.eclipse.jgit.pgm.internal.CLIText;
  33. import org.eclipse.jgit.revwalk.RevWalk;
  34. import org.eclipse.jgit.treewalk.TreeWalk;
  35. import org.eclipse.jgit.util.FS;
  36. import org.eclipse.jgit.util.NB;
  37. import org.kohsuke.args4j.Option;

  38. /**
  39.  * Scan repository to compute maximum number of collisions for hash functions.
  40.  *
  41.  * This is a test suite to help benchmark the collision rate of hash functions
  42.  * when applied to file contents in a Git repository. The test scans all text
  43.  * files in the HEAD revision of the repository it is run within. For each file
  44.  * it finds the unique lines, and then inserts those lines into a hash table to
  45.  * determine collision rates under the selected hash functions.
  46.  *
  47.  * To add another hash function to the test suite, declare a new instance member
  48.  * field of type {@link Hash} and implement the hashRegion method. The test
  49.  * suite will automatically pick up the new function through reflection.
  50.  *
  51.  * To add another folding function (method of squashing a 32 bit hash code into
  52.  * the hash tables smaller array index space), declare a new instance field of
  53.  * type {@link Fold} and implement the logic. The test suite will automatically
  54.  * pick up the new function through reflection.
  55.  */
  56. @Command(usage = "usage_TextHashFunctions")
  57. class TextHashFunctions extends TextBuiltin {

  58.     /** Standard SHA-1 on the line, using the first 4 bytes as the hash code. */
  59.     final Hash sha1 = new Hash() {
  60.         private final MessageDigest md = Constants.newMessageDigest();

  61.         @Override
  62.         protected int hashRegion(byte[] raw, int ptr, int end) {
  63.             md.reset();
  64.             md.update(raw, ptr, end - ptr);
  65.             return NB.decodeInt32(md.digest(), 0);
  66.         }
  67.     };

  68.     /** Professor Daniel J. Bernstein's rather popular string hash. */
  69.     final Hash djb = new Hash() {
  70.         @Override
  71.         protected int hashRegion(byte[] raw, int ptr, int end) {
  72.             int hash = 5381;
  73.             for (; ptr < end; ptr++)
  74.                 hash = ((hash << 5) + hash) + (raw[ptr] & 0xff);
  75.             return hash;
  76.         }
  77.     };

  78.     /** Hash function commonly used by java.lang.String. */
  79.     final Hash string_hash31 = new Hash() {
  80.         @Override
  81.         protected int hashRegion(byte[] raw, int ptr, int end) {
  82.             int hash = 0;
  83.             for (; ptr < end; ptr++)
  84.                 hash = 31 * hash + (raw[ptr] & 0xff);
  85.             return hash;
  86.         }
  87.     };

  88.     /** The Rabin polynomial hash that is used by our own DeltaIndex. */
  89.     final Hash rabin_DeltaIndex = new Hash() {
  90.         private final byte[] buf16 = new byte[16];

  91.         @Override
  92.         protected int hashRegion(byte[] raw, int ptr, int end) {
  93.             if (end - ptr < 16) {
  94.                 Arrays.fill(buf16, (byte) 0);
  95.                 System.arraycopy(raw, ptr, buf16, 0, end - ptr);
  96.                 return rabin(buf16, 0);
  97.             }
  98.             return rabin(raw, ptr);
  99.         }

  100.         private int rabin(byte[] raw, int ptr) {
  101.             int hash;

  102.             // The first 4 steps collapse out into a 4 byte big-endian decode,
  103.             // with a larger right shift as we combined shift lefts together.
  104.             //
  105.             hash = ((raw[ptr] & 0xff) << 24) //
  106.                     | ((raw[ptr + 1] & 0xff) << 16) //
  107.                     | ((raw[ptr + 2] & 0xff) << 8) //
  108.                     | (raw[ptr + 3] & 0xff);
  109.             hash ^= T[hash >>> 31];

  110.             hash = ((hash << 8) | (raw[ptr + 4] & 0xff)) ^ T[hash >>> 23];
  111.             hash = ((hash << 8) | (raw[ptr + 5] & 0xff)) ^ T[hash >>> 23];
  112.             hash = ((hash << 8) | (raw[ptr + 6] & 0xff)) ^ T[hash >>> 23];
  113.             hash = ((hash << 8) | (raw[ptr + 7] & 0xff)) ^ T[hash >>> 23];

  114.             hash = ((hash << 8) | (raw[ptr + 8] & 0xff)) ^ T[hash >>> 23];
  115.             hash = ((hash << 8) | (raw[ptr + 9] & 0xff)) ^ T[hash >>> 23];
  116.             hash = ((hash << 8) | (raw[ptr + 10] & 0xff)) ^ T[hash >>> 23];
  117.             hash = ((hash << 8) | (raw[ptr + 11] & 0xff)) ^ T[hash >>> 23];

  118.             hash = ((hash << 8) | (raw[ptr + 12] & 0xff)) ^ T[hash >>> 23];
  119.             hash = ((hash << 8) | (raw[ptr + 13] & 0xff)) ^ T[hash >>> 23];
  120.             hash = ((hash << 8) | (raw[ptr + 14] & 0xff)) ^ T[hash >>> 23];
  121.             hash = ((hash << 8) | (raw[ptr + 15] & 0xff)) ^ T[hash >>> 23];

  122.             return hash;
  123.         }

  124.         private final int[] T = { 0x00000000, 0xd4c6b32d, 0x7d4bd577,
  125.                 0xa98d665a, 0x2e5119c3, 0xfa97aaee, 0x531accb4, 0x87dc7f99,
  126.                 0x5ca23386, 0x886480ab, 0x21e9e6f1, 0xf52f55dc, 0x72f32a45,
  127.                 0xa6359968, 0x0fb8ff32, 0xdb7e4c1f, 0x6d82d421, 0xb944670c,
  128.                 0x10c90156, 0xc40fb27b, 0x43d3cde2, 0x97157ecf, 0x3e981895,
  129.                 0xea5eabb8, 0x3120e7a7, 0xe5e6548a, 0x4c6b32d0, 0x98ad81fd,
  130.                 0x1f71fe64, 0xcbb74d49, 0x623a2b13, 0xb6fc983e, 0x0fc31b6f,
  131.                 0xdb05a842, 0x7288ce18, 0xa64e7d35, 0x219202ac, 0xf554b181,
  132.                 0x5cd9d7db, 0x881f64f6, 0x536128e9, 0x87a79bc4, 0x2e2afd9e,
  133.                 0xfaec4eb3, 0x7d30312a, 0xa9f68207, 0x007be45d, 0xd4bd5770,
  134.                 0x6241cf4e, 0xb6877c63, 0x1f0a1a39, 0xcbcca914, 0x4c10d68d,
  135.                 0x98d665a0, 0x315b03fa, 0xe59db0d7, 0x3ee3fcc8, 0xea254fe5,
  136.                 0x43a829bf, 0x976e9a92, 0x10b2e50b, 0xc4745626, 0x6df9307c,
  137.                 0xb93f8351, 0x1f8636de, 0xcb4085f3, 0x62cde3a9, 0xb60b5084,
  138.                 0x31d72f1d, 0xe5119c30, 0x4c9cfa6a, 0x985a4947, 0x43240558,
  139.                 0x97e2b675, 0x3e6fd02f, 0xeaa96302, 0x6d751c9b, 0xb9b3afb6,
  140.                 0x103ec9ec, 0xc4f87ac1, 0x7204e2ff, 0xa6c251d2, 0x0f4f3788,
  141.                 0xdb8984a5, 0x5c55fb3c, 0x88934811, 0x211e2e4b, 0xf5d89d66,
  142.                 0x2ea6d179, 0xfa606254, 0x53ed040e, 0x872bb723, 0x00f7c8ba,
  143.                 0xd4317b97, 0x7dbc1dcd, 0xa97aaee0, 0x10452db1, 0xc4839e9c,
  144.                 0x6d0ef8c6, 0xb9c84beb, 0x3e143472, 0xead2875f, 0x435fe105,
  145.                 0x97995228, 0x4ce71e37, 0x9821ad1a, 0x31accb40, 0xe56a786d,
  146.                 0x62b607f4, 0xb670b4d9, 0x1ffdd283, 0xcb3b61ae, 0x7dc7f990,
  147.                 0xa9014abd, 0x008c2ce7, 0xd44a9fca, 0x5396e053, 0x8750537e,
  148.                 0x2edd3524, 0xfa1b8609, 0x2165ca16, 0xf5a3793b, 0x5c2e1f61,
  149.                 0x88e8ac4c, 0x0f34d3d5, 0xdbf260f8, 0x727f06a2, 0xa6b9b58f,
  150.                 0x3f0c6dbc, 0xebcade91, 0x4247b8cb, 0x96810be6, 0x115d747f,
  151.                 0xc59bc752, 0x6c16a108, 0xb8d01225, 0x63ae5e3a, 0xb768ed17,
  152.                 0x1ee58b4d, 0xca233860, 0x4dff47f9, 0x9939f4d4, 0x30b4928e,
  153.                 0xe47221a3, 0x528eb99d, 0x86480ab0, 0x2fc56cea, 0xfb03dfc7,
  154.                 0x7cdfa05e, 0xa8191373, 0x01947529, 0xd552c604, 0x0e2c8a1b,
  155.                 0xdaea3936, 0x73675f6c, 0xa7a1ec41, 0x207d93d8, 0xf4bb20f5,
  156.                 0x5d3646af, 0x89f0f582, 0x30cf76d3, 0xe409c5fe, 0x4d84a3a4,
  157.                 0x99421089, 0x1e9e6f10, 0xca58dc3d, 0x63d5ba67, 0xb713094a,
  158.                 0x6c6d4555, 0xb8abf678, 0x11269022, 0xc5e0230f, 0x423c5c96,
  159.                 0x96faefbb, 0x3f7789e1, 0xebb13acc, 0x5d4da2f2, 0x898b11df,
  160.                 0x20067785, 0xf4c0c4a8, 0x731cbb31, 0xa7da081c, 0x0e576e46,
  161.                 0xda91dd6b, 0x01ef9174, 0xd5292259, 0x7ca44403, 0xa862f72e,
  162.                 0x2fbe88b7, 0xfb783b9a, 0x52f55dc0, 0x8633eeed, 0x208a5b62,
  163.                 0xf44ce84f, 0x5dc18e15, 0x89073d38, 0x0edb42a1, 0xda1df18c,
  164.                 0x739097d6, 0xa75624fb, 0x7c2868e4, 0xa8eedbc9, 0x0163bd93,
  165.                 0xd5a50ebe, 0x52797127, 0x86bfc20a, 0x2f32a450, 0xfbf4177d,
  166.                 0x4d088f43, 0x99ce3c6e, 0x30435a34, 0xe485e919, 0x63599680,
  167.                 0xb79f25ad, 0x1e1243f7, 0xcad4f0da, 0x11aabcc5, 0xc56c0fe8,
  168.                 0x6ce169b2, 0xb827da9f, 0x3ffba506, 0xeb3d162b, 0x42b07071,
  169.                 0x9676c35c, 0x2f49400d, 0xfb8ff320, 0x5202957a, 0x86c42657,
  170.                 0x011859ce, 0xd5deeae3, 0x7c538cb9, 0xa8953f94, 0x73eb738b,
  171.                 0xa72dc0a6, 0x0ea0a6fc, 0xda6615d1, 0x5dba6a48, 0x897cd965,
  172.                 0x20f1bf3f, 0xf4370c12, 0x42cb942c, 0x960d2701, 0x3f80415b,
  173.                 0xeb46f276, 0x6c9a8def, 0xb85c3ec2, 0x11d15898, 0xc517ebb5,
  174.                 0x1e69a7aa, 0xcaaf1487, 0x632272dd, 0xb7e4c1f0, 0x3038be69,
  175.                 0xe4fe0d44, 0x4d736b1e, 0x99b5d833 };
  176.     };

  177.     /** Bitwise-and to extract only the low bits. */
  178.     final Fold truncate = new Fold() {
  179.         @Override
  180.         public int fold(int hash, int bits) {
  181.             return hash & ((1 << bits) - 1);
  182.         }
  183.     };

  184.     /** Applies the golden ratio and takes the upper bits. */
  185.     final Fold golden_ratio = new Fold() {
  186.         @Override
  187.         public int fold(int hash, int bits) {
  188.             /* 2^31 + 2^29 - 2^25 + 2^22 - 2^19 - 2^16 + 1 */
  189.             return (hash * 0x9e370001) >>> (32 - bits);
  190.         }
  191.     };

  192.     // -----------------------------------------------------------------------
  193.     //
  194.     // Implementation of the suite lives below this line.
  195.     //
  196.     //

  197.     @Option(name = "--hash", metaVar = "NAME", usage = "Enable hash function(s)")
  198.     List<String> hashFunctions = new ArrayList<>();

  199.     @Option(name = "--fold", metaVar = "NAME", usage = "Enable fold function(s)")
  200.     List<String> foldFunctions = new ArrayList<>();

  201.     @Option(name = "--text-limit", metaVar = "LIMIT", usage = "Maximum size in KiB to scan")
  202.     int textLimit = 15 * 1024; // 15 MiB as later we do * 1024.

  203.     @Option(name = "--repository", aliases = { "-r" }, metaVar = "GIT_DIR", usage = "Repository to scan")
  204.     List<File> gitDirs = new ArrayList<>();

  205.     /** {@inheritDoc} */
  206.     @Override
  207.     protected boolean requiresRepository() {
  208.         return false;
  209.     }

  210.     /** {@inheritDoc} */
  211.     @Override
  212.     protected void run() throws Exception {
  213.         if (gitDirs.isEmpty()) {
  214.             RepositoryBuilder rb = new RepositoryBuilder() //
  215.                     .setGitDir(new File(gitdir)) //
  216.                     .readEnvironment() //
  217.                     .findGitDir();
  218.             if (rb.getGitDir() == null)
  219.                 throw die(CLIText.get().cantFindGitDirectory);
  220.             gitDirs.add(rb.getGitDir());
  221.         }

  222.         for (File dir : gitDirs) {
  223.             RepositoryBuilder rb = new RepositoryBuilder();
  224.             if (RepositoryCache.FileKey.isGitRepository(dir, FS.DETECTED))
  225.                 rb.setGitDir(dir);
  226.             else
  227.                 rb.findGitDir(dir);

  228.             try (Repository repo = rb.build()) {
  229.                 run(repo);
  230.             }
  231.         }
  232.     }

  233.     private void run(Repository repo) throws Exception {
  234.         List<Function> all = init();

  235.         long fileCnt = 0;
  236.         long lineCnt = 0;
  237.         try (ObjectReader or = repo.newObjectReader();
  238.             RevWalk rw = new RevWalk(or);
  239.             TreeWalk tw = new TreeWalk(or)) {
  240.             final MutableObjectId id = new MutableObjectId();
  241.             tw.reset(rw.parseTree(repo.resolve(Constants.HEAD)));
  242.             tw.setRecursive(true);

  243.             while (tw.next()) {
  244.                 FileMode fm = tw.getFileMode(0);
  245.                 if (!FileMode.REGULAR_FILE.equals(fm)
  246.                         && !FileMode.EXECUTABLE_FILE.equals(fm))
  247.                     continue;

  248.                 byte[] raw;
  249.                 try {
  250.                     tw.getObjectId(id, 0);
  251.                     raw = or.open(id).getCachedBytes(textLimit * 1024);
  252.                 } catch (LargeObjectException tooBig) {
  253.                     continue;
  254.                 }

  255.                 if (RawText.isBinary(raw, raw.length, true))
  256.                     continue;

  257.                 RawText txt = new RawText(raw);
  258.                 int[] lines = new int[txt.size()];
  259.                 int cnt = 0;
  260.                 HashSet<Line> u = new HashSet<>();
  261.                 for (int i = 0; i < txt.size(); i++) {
  262.                     if (u.add(new Line(txt, i)))
  263.                         lines[cnt++] = i;
  264.                 }

  265.                 fileCnt++;
  266.                 lineCnt += cnt;

  267.                 for (Function fun : all)
  268.                     testOne(fun, txt, lines, cnt);
  269.             }
  270.         }

  271.         File directory = repo.getDirectory();
  272.         if (directory != null) {
  273.             String name = directory.getName();
  274.             File parent = directory.getParentFile();
  275.             if (name.equals(Constants.DOT_GIT) && parent != null)
  276.                 name = parent.getName();
  277.             outw.println(name + ":"); //$NON-NLS-1$
  278.         }
  279.         outw.format("  %6d files; %5d avg. unique lines/file\n", //$NON-NLS-1$
  280.                 valueOf(fileCnt), //
  281.                 valueOf(lineCnt / fileCnt));
  282.         outw.format("%-20s %-15s %9s\n", "Hash", "Fold", "Max Len"); //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$
  283.         outw.println("-----------------------------------------------"); //$NON-NLS-1$
  284.         String lastHashName = null;
  285.         for (Function fun : all) {
  286.             String hashName = fun.hash.name;
  287.             if (hashName.equals(lastHashName))
  288.                 hashName = ""; //$NON-NLS-1$
  289.             outw.format("%-20s %-15s %9d\n", // //$NON-NLS-1$
  290.                     hashName, //
  291.                     fun.fold.name, //
  292.                     valueOf(fun.maxChainLength));
  293.             lastHashName = fun.hash.name;
  294.         }
  295.         outw.println();
  296.         outw.flush();
  297.     }

  298.     private static void testOne(Function fun, RawText txt, int[] elements,
  299.             int cnt) {
  300.         final Hash cmp = fun.hash;
  301.         final Fold fold = fun.fold;

  302.         final int bits = tableBits(cnt);
  303.         final int[] buckets = new int[1 << bits];
  304.         for (int i = 0; i < cnt; i++)
  305.             buckets[fold.fold(cmp.hash(txt, elements[i]), bits)]++;

  306.         int maxChainLength = 0;
  307.         for (int i = 0; i < buckets.length; i++)
  308.             maxChainLength = Math.max(maxChainLength, buckets[i]);
  309.         fun.maxChainLength = Math.max(fun.maxChainLength, maxChainLength);
  310.     }

  311.     private List<Function> init() {
  312.         List<Hash> hashes = new ArrayList<>();
  313.         List<Fold> folds = new ArrayList<>();

  314.         try {
  315.             for (Field f : TextHashFunctions.class.getDeclaredFields()) {
  316.                 if (f.getType() == Hash.class) {
  317.                     f.setAccessible(true);
  318.                     Hash cmp = (Hash) f.get(this);
  319.                     cmp.name = f.getName();
  320.                     hashes.add(cmp);

  321.                 } else if (f.getType() == Fold.class) {
  322.                     f.setAccessible(true);
  323.                     Fold fold = (Fold) f.get(this);
  324.                     fold.name = f.getName();
  325.                     folds.add(fold);
  326.                 }
  327.             }
  328.         } catch (IllegalArgumentException | IllegalAccessException e) {
  329.             throw new RuntimeException("Cannot determine names", e); //$NON-NLS-1$
  330.         }

  331.         List<Function> all = new ArrayList<>();
  332.         for (Hash cmp : hashes) {
  333.             if (include(cmp.name, hashFunctions)) {
  334.                 for (Fold f : folds) {
  335.                     if (include(f.name, foldFunctions)) {
  336.                         all.add(new Function(cmp, f));
  337.                     }
  338.                 }
  339.             }
  340.         }
  341.         return all;
  342.     }

  343.     private static boolean include(String name, List<String> want) {
  344.         if (want.isEmpty())
  345.             return true;
  346.         for (String s : want) {
  347.             if (s.equalsIgnoreCase(name))
  348.                 return true;
  349.         }
  350.         return false;
  351.     }

  352.     private static class Function {
  353.         final Hash hash;

  354.         final Fold fold;

  355.         int maxChainLength;

  356.         Function(Hash cmp, Fold fold) {
  357.             this.hash = cmp;
  358.             this.fold = fold;
  359.         }
  360.     }

  361.     /** Base class for any hashCode function to be tested. */
  362.     private abstract static class Hash extends RawTextComparator {
  363.         String name;

  364.         @Override
  365.         public boolean equals(RawText a, int ai, RawText b, int bi) {
  366.             return RawTextComparator.DEFAULT.equals(a, ai, b, bi);
  367.         }
  368.     }

  369.     /** Base class for any hashCode folding function to be tested. */
  370.     private abstract static class Fold {
  371.         String name;

  372.         /**
  373.          * Fold the given 32-bit hash code into only {@code bits} of space.
  374.          *
  375.          * @param hash
  376.          *            the 32 bit hash code to be folded into a smaller value.
  377.          * @param bits
  378.          *            total number of bits that can appear in the output. The
  379.          *            output value must be in the range {@code [0, 1 << bits)}.
  380.          *            When bits = 2, valid outputs are 0, 1, 2, 3.
  381.          * @return the folded hash, squeezed into only {@code bits}.
  382.          */
  383.         abstract int fold(int hash, int bits);
  384.     }

  385.     /** Utility to help us identify unique lines in a file. */
  386.     private static class Line {
  387.         private final RawText txt;

  388.         private final int pos;

  389.         Line(RawText txt, int pos) {
  390.             this.txt = txt;
  391.             this.pos = pos;
  392.         }

  393.         @Override
  394.         public int hashCode() {
  395.             return RawTextComparator.DEFAULT.hash(txt, pos);
  396.         }

  397.         @Override
  398.         public boolean equals(Object obj) {
  399.             if (obj instanceof Line) {
  400.                 Line e = (Line) obj;
  401.                 return RawTextComparator.DEFAULT.equals(txt, pos, e.txt, e.pos);
  402.             }
  403.             return false;
  404.         }
  405.     }

  406.     private static int tableBits(int sz) {
  407.         int bits = 31 - Integer.numberOfLeadingZeros(sz);
  408.         if (bits == 0)
  409.             bits = 1;
  410.         if (1 << bits < sz)
  411.             bits++;
  412.         return bits;
  413.     }
  414. }