Info command: Show histogram of key/value sizes and number of keys per table (#730)

* Add ShowKeyValueHistogram method

This commit adds the ShowKeyValueHistogram method and the types associated with it.

Signed-off-by: Ibrahim Jarif <[email protected]>

* Remove whitespace

Signed-off-by: Ibrahim Jarif <[email protected]>

* Fix comment

Signed-off-by: Ibrahim Jarif <[email protected]>

* Move histogram code to histogram.go and add histogram_test.go file

Signed-off-by: Ibrahim Jarif <[email protected]>

* Show number of keys present per SSTable

The output of the `badger info` command now shows the number of keys per SSTable.

Signed-off-by: Ibrahim Jarif <[email protected]>

* Rename ShowKeyValueSizeHistogram to PrintKeyValueHistogram

Signed-off-by: Ibrahim Jarif <[email protected]>
diff --git a/badger/cmd/info.go b/badger/cmd/info.go
index fd3ef92..b84fd6d 100644
--- a/badger/cmd/info.go
+++ b/badger/cmd/info.go
@@ -33,6 +33,23 @@
 	"github.com/spf13/cobra"
 )
 
+type flagOptions struct {
+	showTables    bool
+	sizeHistogram bool
+}
+
+var (
+	opt flagOptions
+)
+
+func init() {
+	RootCmd.AddCommand(infoCmd)
+	infoCmd.Flags().BoolVarP(&opt.showTables, "show-tables", "s", false,
+		"If set to true, show tables as well.")
+	infoCmd.Flags().BoolVar(&opt.sizeHistogram, "histogram", false,
+		"Show a histogram of the key and value sizes.")
+}
+
 var infoCmd = &cobra.Command{
 	Use:   "info",
 	Short: "Health info about Badger database.",
@@ -48,25 +65,35 @@
 			fmt.Println("Error:", err.Error())
 			os.Exit(1)
 		}
-		if !showTables {
+		if !opt.showTables {
 			return
 		}
-		err = tableInfo(sstDir, vlogDir)
+		// Open DB
+		opts := badger.DefaultOptions
+		opts.TableLoadingMode = options.MemoryMap
+		opts.Dir = sstDir
+		opts.ValueDir = vlogDir
+		opts.ReadOnly = true
+
+		db, err := badger.Open(opts)
 		if err != nil {
 			fmt.Println("Error:", err.Error())
 			os.Exit(1)
 		}
+		defer db.Close()
+
+		err = tableInfo(sstDir, vlogDir, db)
+		if err != nil {
+			fmt.Println("Error:", err.Error())
+			os.Exit(1)
+		}
+		if opt.sizeHistogram {
+			// use prefix as nil since we want to list all keys
+			db.PrintKeyValueHistogram(nil)
+		}
 	},
 }
 
-var showTables bool
-
-func init() {
-	RootCmd.AddCommand(infoCmd)
-	infoCmd.Flags().BoolVarP(&showTables, "show-tables", "s", false,
-		"If set to true, show tables as well.")
-}
-
 func hbytes(sz int64) string {
 	return humanize.Bytes(uint64(sz))
 }
@@ -75,27 +102,20 @@
 	return humanize.RelTime(dst, src, "earlier", "later")
 }
 
-func tableInfo(dir, valueDir string) error {
-	// Open DB
-	opts := badger.DefaultOptions
-	opts.TableLoadingMode = options.MemoryMap
-	opts.Dir = sstDir
-	opts.ValueDir = vlogDir
-	opts.ReadOnly = true
-
-	db, err := badger.Open(opts)
-	if err != nil {
-		return err
-	}
-	defer db.Close()
-
+func tableInfo(dir, valueDir string, db *badger.DB) error {
 	tables := db.Tables()
+	fmt.Printf("\n%s SSTables %[1]s\n", strings.Repeat("=", 45))
+	fmt.Printf("%-5s\t%-10s\t%-30s\t%-30s\t%-7s\n", "ID", "Level",
+		"Left-Key(in hex) (Time)", "Right-Key(in hex) (Time)", "Total Keys")
+	fmt.Printf("%s\n", strings.Repeat("=", 100))
 	for _, t := range tables {
-		lk, lv := y.ParseKey(t.Left), y.ParseTs(t.Left)
-		rk, rv := y.ParseKey(t.Right), y.ParseTs(t.Right)
-		fmt.Printf("SSTable [L%d, %03d] [%20X, v%-10d -> %20X, v%-10d]\n",
-			t.Level, t.ID, lk, lv, rk, rv)
+		lk, lt := y.ParseKey(t.Left), y.ParseTs(t.Left)
+		rk, rt := y.ParseKey(t.Right), y.ParseTs(t.Right)
+
+		fmt.Printf("%-5d\tL%-9d\t%-30s\t%-30s\t%-7d\n", t.ID, t.Level,
+			fmt.Sprintf("%X (v%d)", lk, lt), fmt.Sprintf("%X (v%d)", rk, rt), t.KeyCount)
 	}
+	fmt.Println()
 	return nil
 }
 
@@ -135,7 +155,6 @@
 
 	fmt.Println()
 	var baseTime time.Time
-	// fmt.Print("\n[Manifest]\n")
 	manifestTruncated := false
 	manifestInfo, ok := fileinfoByName[badger.ManifestFilename]
 	if ok {
diff --git a/histogram.go b/histogram.go
new file mode 100644
index 0000000..0d46439
--- /dev/null
+++ b/histogram.go
@@ -0,0 +1,169 @@
+/*
+ * Copyright 2019 Dgraph Labs, Inc. and Contributors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package badger
+
+import (
+	"fmt"
+	"math"
+)
+
+// PrintKeyValueHistogram builds and displays the key-value size histogram.
+// When keyPrefix is set, only the keys that have prefix "keyPrefix" are
+// considered for creating the histogram.
+func (db *DB) PrintKeyValueHistogram(keyPrefix []byte) {
+	if db == nil {
+		fmt.Println("\nCannot build histogram: DB is nil.")
+		return
+	}
+	histogram := db.buildKeyValueSizeHistogram(keyPrefix)
+	fmt.Printf("Histogram of key sizes (in bytes)\n")
+	histogram.keySizeHistogram.printHistogram()
+	fmt.Printf("Histogram of value sizes (in bytes)\n")
+	histogram.valueSizeHistogram.printHistogram()
+}
+
+// histogramData stores information about a histogram
+type histogramData struct {
+	bins        []int64
+	countPerBin []int64
+	totalCount  int64
+	min         int64
+	max         int64
+	sum         int64
+}
+
+// keyValueSizeHistogram contains keySize histogram and valueSize histogram
+type keyValueSizeHistogram struct {
+	keySizeHistogram, valueSizeHistogram histogramData
+}
+
+// newKeyValueSizeHistogram returns a new instance of keyValueSizeHistogram with
+// properly initialized fields.
+func newKeyValueSizeHistogram() *keyValueSizeHistogram {
+	// TODO(ibrahim): find appropriate bin size.
+	keyBins := createHistogramBins(1, 16)
+	valueBins := createHistogramBins(1, 30)
+	return &keyValueSizeHistogram{
+		keySizeHistogram: histogramData{
+			bins:        keyBins,
+			countPerBin: make([]int64, len(keyBins)+1),
+			max:         math.MinInt64,
+			min:         math.MaxInt64,
+			sum:         0,
+		},
+		valueSizeHistogram: histogramData{
+			bins:        valueBins,
+			countPerBin: make([]int64, len(valueBins)+1),
+			max:         math.MinInt64,
+			min:         math.MaxInt64,
+			sum:         0,
+		},
+	}
+}
+
+// createHistogramBins creates bins for a histogram. The bin bounds are powers
+// of two of the form [2^minExponent, ..., 2^maxExponent].
+func createHistogramBins(minExponent, maxExponent uint32) []int64 {
+	var bins []int64
+	for i := minExponent; i <= maxExponent; i++ {
+		bins = append(bins, int64(1)<<i)
+	}
+	return bins
+}
+
+// Update records value in the histogram: it adjusts min/max, adds value to sum,
+// increments totalCount and bumps the count of the bin the value falls into.
+func (histogram *histogramData) Update(value int64) {
+	if value > histogram.max {
+		histogram.max = value
+	}
+	if value < histogram.min {
+		histogram.min = value
+	}
+
+	histogram.sum += value
+	histogram.totalCount++
+
+	for index := 0; index <= len(histogram.bins); index++ {
+		// Put the value in the last (overflow) bin if we reached the end of the bins slice.
+		if index == len(histogram.bins) {
+			histogram.countPerBin[index]++
+			break
+		}
+
+		// Check if the value should be added to the "index" bin
+		if value < int64(histogram.bins[index]) {
+			histogram.countPerBin[index]++
+			break
+		}
+	}
+}
+
+// buildKeyValueSizeHistogram builds the key-value size histogram.
+// When keyPrefix is set, only the keys that have prefix "keyPrefix" are
+// considered for creating the histogram.
+func (db *DB) buildKeyValueSizeHistogram(keyPrefix []byte) *keyValueSizeHistogram {
+	txn := db.NewTransaction(false)
+	defer txn.Discard()
+
+	itr := txn.NewIterator(DefaultIteratorOptions)
+	defer itr.Close()
+
+	badgerHistogram := newKeyValueSizeHistogram()
+
+	// Collect key and value sizes.
+	for itr.Seek(keyPrefix); itr.ValidForPrefix(keyPrefix); itr.Next() {
+		item := itr.Item()
+		badgerHistogram.keySizeHistogram.Update(item.KeySize())
+		badgerHistogram.valueSizeHistogram.Update(item.ValueSize())
+	}
+	return badgerHistogram
+}
+
+// printHistogram prints the histogram data in a human-readable format.
+func (histogram histogramData) printHistogram() {
+	fmt.Printf("Total count: %d\n", histogram.totalCount)
+	fmt.Printf("Min value: %d\n", histogram.min)
+	fmt.Printf("Max value: %d\n", histogram.max)
+	fmt.Printf("Mean: %.2f\n", float64(histogram.sum)/float64(histogram.totalCount))
+	fmt.Printf("%24s %9s\n", "Range", "Count")
+
+	numBins := len(histogram.bins)
+	for index, count := range histogram.countPerBin {
+		if count == 0 {
+			continue
+		}
+
+		// The last bin represents the bin that contains the range from
+		// the last bin up to infinity so it's processed differently than the
+		// other bins.
+		if index == len(histogram.countPerBin)-1 {
+			lowerBound := int(histogram.bins[numBins-1])
+			fmt.Printf("[%10d, %10s) %9d\n", lowerBound, "infinity", count)
+			continue
+		}
+
+		upperBound := int(histogram.bins[index])
+		lowerBound := 0
+		if index > 0 {
+			lowerBound = int(histogram.bins[index-1])
+		}
+
+		fmt.Printf("[%10d, %10d) %9d\n", lowerBound, upperBound, count)
+	}
+	fmt.Println()
+}
diff --git a/histogram_test.go b/histogram_test.go
new file mode 100644
index 0000000..240fca6
--- /dev/null
+++ b/histogram_test.go
@@ -0,0 +1,107 @@
+/*
+ * Copyright 2019 Dgraph Labs, Inc. and Contributors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package badger
+
+import (
+	"testing"
+
+	"github.com/stretchr/testify/require"
+)
+
+func TestBuildKeyValueSizeHistogram(t *testing.T) {
+	t.Run("All same size key-values", func(t *testing.T) {
+		runBadgerTest(t, nil, func(t *testing.T, db *DB) {
+			entries := int64(40)
+			err := db.Update(func(txn *Txn) error {
+				for i := int64(0); i < entries; i++ {
+					err := txn.Set([]byte(string(i)), []byte("B"))
+					if err != nil {
+						return err
+					}
+				}
+				return nil
+			})
+			require.NoError(t, err)
+
+			histogram := db.buildKeyValueSizeHistogram(nil)
+			keyHistogram := histogram.keySizeHistogram
+			valueHistogram := histogram.valueSizeHistogram
+
+			require.Equal(t, entries, keyHistogram.totalCount)
+			require.Equal(t, entries, valueHistogram.totalCount)
+
+			// Each entry is of size one. So the sum of sizes should be the same
+			// as number of entries
+			require.Equal(t, entries, valueHistogram.sum)
+			require.Equal(t, entries, keyHistogram.sum)
+
+			// All value sizes are same. The first bin should have all the values.
+			require.Equal(t, entries, valueHistogram.countPerBin[0])
+			require.Equal(t, entries, keyHistogram.countPerBin[0])
+
+			require.Equal(t, int64(1), keyHistogram.max)
+			require.Equal(t, int64(1), keyHistogram.min)
+			require.Equal(t, int64(1), valueHistogram.max)
+			require.Equal(t, int64(1), valueHistogram.min)
+		})
+	})
+
+	t.Run("different size key-values", func(t *testing.T) {
+		runBadgerTest(t, nil, func(t *testing.T, db *DB) {
+			entries := int64(3)
+			err := db.Update(func(txn *Txn) error {
+				if err := txn.Set([]byte("A"), []byte("B")); err != nil {
+					return err
+				}
+
+				if err := txn.Set([]byte("AA"), []byte("BB")); err != nil {
+					return err
+				}
+
+				if err := txn.Set([]byte("AAA"), []byte("BBB")); err != nil {
+					return err
+				}
+				return nil
+			})
+			require.NoError(t, err)
+
+			histogram := db.buildKeyValueSizeHistogram(nil)
+			keyHistogram := histogram.keySizeHistogram
+			valueHistogram := histogram.valueSizeHistogram
+
+			require.Equal(t, entries, keyHistogram.totalCount)
+			require.Equal(t, entries, valueHistogram.totalCount)
+
+			// The entries have sizes 1, 2 and 3. So the sum of sizes should be 6
+			// for both the keys and the values.
+			require.Equal(t, int64(6), valueHistogram.sum)
+			require.Equal(t, int64(6), keyHistogram.sum)
+
+			// Length 1 key is in first bucket, length 2 and 3 are in the second
+			// bucket
+			require.Equal(t, int64(1), valueHistogram.countPerBin[0])
+			require.Equal(t, int64(2), valueHistogram.countPerBin[1])
+			require.Equal(t, int64(1), keyHistogram.countPerBin[0])
+			require.Equal(t, int64(2), keyHistogram.countPerBin[1])
+
+			require.Equal(t, int64(3), keyHistogram.max)
+			require.Equal(t, int64(1), keyHistogram.min)
+			require.Equal(t, int64(3), valueHistogram.max)
+			require.Equal(t, int64(1), valueHistogram.min)
+		})
+	})
+}
diff --git a/iterator.go b/iterator.go
index 7e79266..f6df716 100644
--- a/iterator.go
+++ b/iterator.go
@@ -246,6 +246,12 @@
 	return int64(vp.Len) // includes key length.
 }
 
+// KeySize returns the size of the key.
+// The exact size of the stored key is the key size + 8 bytes of timestamp.
+func (item *Item) KeySize() int64 {
+	return int64(len(item.key))
+}
+
 // ValueSize returns the exact size of the value.
 //
 // This can be called to quickly estimate the size of a value without fetching
diff --git a/levels.go b/levels.go
index 9c8a490..9d2b501 100644
--- a/levels.go
+++ b/levels.go
@@ -945,20 +945,27 @@
 
 // TableInfo represents the information about a table.
 type TableInfo struct {
-	ID    uint64
-	Level int
-	Left  []byte
-	Right []byte
+	ID       uint64
+	Level    int
+	Left     []byte
+	Right    []byte
+	KeyCount uint64 // Number of keys in the table
 }
 
 func (s *levelsController) getTableInfo() (result []TableInfo) {
 	for _, l := range s.levels {
 		for _, t := range l.tables {
+			it := t.NewIterator(false)
+			var count uint64
+			for it.Rewind(); it.Valid(); it.Next() {
+				count++
+			}
 			info := TableInfo{
-				ID:    t.ID(),
-				Level: l.level,
-				Left:  t.Smallest(),
-				Right: t.Biggest(),
+				ID:       t.ID(),
+				Level:    l.level,
+				Left:     t.Smallest(),
+				Right:    t.Biggest(),
+				KeyCount: count,
 			}
 			result = append(result, info)
 		}
diff --git a/table/table.go b/table/table.go
index 4e57b91..5ec2b30 100644
--- a/table/table.go
+++ b/table/table.go
@@ -59,7 +59,7 @@
 	tableSize int      // Initialized in OpenTable, using fd.Stat().
 
 	blockIndex []keyOffset
-	ref        int32 // For file garbage collection.  Atomic.
+	ref        int32 // For file garbage collection. Atomic.
 
 	loadingMode options.FileLoadingMode
 	mmap        []byte // Memory mapped.