| /* |
| * Copyright 2017 Dgraph Labs, Inc. and Contributors |
| * |
| * Licensed under the Apache License, Version 2.0 (the "License"); |
| * you may not use this file except in compliance with the License. |
| * You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| package badger |
| |
| import ( |
| "bytes" |
| "context" |
| "encoding/hex" |
| stderrors "errors" |
| "fmt" |
| "math" |
| "math/rand" |
| "os" |
| "sort" |
| "strings" |
| "sync" |
| "sync/atomic" |
| "time" |
| |
| "github.com/pkg/errors" |
| otrace "go.opencensus.io/trace" |
| |
| "github.com/dgraph-io/badger/v4/options" |
| "github.com/dgraph-io/badger/v4/pb" |
| "github.com/dgraph-io/badger/v4/table" |
| "github.com/dgraph-io/badger/v4/y" |
| "github.com/dgraph-io/ristretto/v2/z" |
| ) |
| |
| type levelsController struct { |
| nextFileID atomic.Uint64 |
| l0stallsMs atomic.Int64 |
| |
| // The following are initialized once and const. |
| levels []*levelHandler |
| kv *DB |
| |
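	// cstatus tracks the levels, key ranges, and tables involved in currently
	// running compactions.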
| cstatus compactStatus |
| } |
| |
// revertToManifest checks that all necessary table files exist and removes all table files not
// referenced by the manifest. idMap is the set of table file IDs that were read from the
// directory listing.
| func revertToManifest(kv *DB, mf *Manifest, idMap map[uint64]struct{}) error { |
| // 1. Check all files in manifest exist. |
| for id := range mf.Tables { |
| if _, ok := idMap[id]; !ok { |
| return fmt.Errorf("file does not exist for table %d", id) |
| } |
| } |
| |
| // 2. Delete files that shouldn't exist. |
| for id := range idMap { |
| if _, ok := mf.Tables[id]; !ok { |
| kv.opt.Debugf("Table file %d not referenced in MANIFEST\n", id) |
| filename := table.NewFilename(id, kv.opt.Dir) |
| if err := os.Remove(filename); err != nil { |
| return y.Wrapf(err, "While removing table %d", id) |
| } |
| } |
| } |
| |
| return nil |
| } |
| |
| func newLevelsController(db *DB, mf *Manifest) (*levelsController, error) { |
| y.AssertTrue(db.opt.NumLevelZeroTablesStall > db.opt.NumLevelZeroTables) |
| s := &levelsController{ |
| kv: db, |
| levels: make([]*levelHandler, db.opt.MaxLevels), |
| } |
| s.cstatus.tables = make(map[uint64]struct{}) |
| s.cstatus.levels = make([]*levelCompactStatus, db.opt.MaxLevels) |
| |
| for i := 0; i < db.opt.MaxLevels; i++ { |
| s.levels[i] = newLevelHandler(db, i) |
| s.cstatus.levels[i] = new(levelCompactStatus) |
| } |
| |
| if db.opt.InMemory { |
| return s, nil |
| } |
	// Compare the manifest against the directory: verify that every referenced file exists,
	// and remove any table files the manifest does not reference.
| if err := revertToManifest(db, mf, getIDMap(db.opt.Dir)); err != nil { |
| return nil, err |
| } |
| |
| var mu sync.Mutex |
| tables := make([][]*table.Table, db.opt.MaxLevels) |
| var maxFileID uint64 |
| |
	// We found that using 3 goroutines allows disk throughput to be utilized to its max. Disk
	// utilization is the main thing we should focus on while reading the data, since it is
	// the one factor that remains constant between HDD and SSD.
| throttle := y.NewThrottle(3) |
| |
| start := time.Now() |
| var numOpened atomic.Int32 |
| tick := time.NewTicker(3 * time.Second) |
| defer tick.Stop() |
| |
| for fileID, tf := range mf.Tables { |
| fname := table.NewFilename(fileID, db.opt.Dir) |
| select { |
| case <-tick.C: |
| db.opt.Infof("%d tables out of %d opened in %s\n", numOpened.Load(), |
| len(mf.Tables), time.Since(start).Round(time.Millisecond)) |
| default: |
| } |
| if err := throttle.Do(); err != nil { |
| closeAllTables(tables) |
| return nil, err |
| } |
| if fileID > maxFileID { |
| maxFileID = fileID |
| } |
| go func(fname string, tf TableManifest) { |
| var rerr error |
| defer func() { |
| throttle.Done(rerr) |
| numOpened.Add(1) |
| }() |
| dk, err := db.registry.DataKey(tf.KeyID) |
| if err != nil { |
| rerr = y.Wrapf(err, "Error while reading datakey") |
| return |
| } |
| topt := buildTableOptions(db) |
| // Explicitly set Compression and DataKey based on how the table was generated. |
| topt.Compression = tf.Compression |
| topt.DataKey = dk |
| |
			mmapFile, err := z.OpenMmapFile(fname, db.opt.getFileFlags(), 0)
			if err != nil {
				rerr = y.Wrapf(err, "Opening file: %q", fname)
				return
			}
			t, err := table.OpenTable(mmapFile, topt)
			if err != nil {
				if strings.HasPrefix(err.Error(), "CHECKSUM_MISMATCH:") {
					db.opt.Errorf(err.Error())
					db.opt.Errorf("Ignoring table %s", mmapFile.Fd.Name())
| // Do not set rerr. We will continue without this table. |
| } else { |
| rerr = y.Wrapf(err, "Opening table: %q", fname) |
| } |
| return |
| } |
| |
| mu.Lock() |
| tables[tf.Level] = append(tables[tf.Level], t) |
| mu.Unlock() |
| }(fname, tf) |
| } |
| if err := throttle.Finish(); err != nil { |
| closeAllTables(tables) |
| return nil, err |
| } |
| db.opt.Infof("All %d tables opened in %s\n", numOpened.Load(), |
| time.Since(start).Round(time.Millisecond)) |
| s.nextFileID.Store(maxFileID + 1) |
| for i, tbls := range tables { |
| s.levels[i].initTables(tbls) |
| } |
| |
| // Make sure key ranges do not overlap etc. |
| if err := s.validate(); err != nil { |
| _ = s.cleanupLevels() |
| return nil, y.Wrap(err, "Level validation") |
| } |
| |
| // Sync directory (because we have at least removed some files, or previously created the |
| // manifest file). |
| if err := syncDir(db.opt.Dir); err != nil { |
| _ = s.close() |
| return nil, err |
| } |
| |
| return s, nil |
| } |
| |
// closeAllTables closes the tables, for cleanup in newLevelsController. (We Close() instead of
// using DecrRef() because that would delete the underlying files.) We ignore errors, which is OK
// because tables are read-only.
| func closeAllTables(tables [][]*table.Table) { |
	for _, tableSlice := range tables {
		for _, t := range tableSlice {
			_ = t.Close(-1)
		}
	}
| } |
| |
| func (s *levelsController) cleanupLevels() error { |
| var firstErr error |
| for _, l := range s.levels { |
| if err := l.close(); err != nil && firstErr == nil { |
| firstErr = err |
| } |
| } |
| return firstErr |
| } |
| |
| // dropTree picks all tables from all levels, creates a manifest changeset, |
| // applies it, and then decrements the refs of these tables, which would result |
| // in their deletion. |
| func (s *levelsController) dropTree() (int, error) { |
| // First pick all tables, so we can create a manifest changelog. |
| var all []*table.Table |
| for _, l := range s.levels { |
| l.RLock() |
| all = append(all, l.tables...) |
| l.RUnlock() |
| } |
| if len(all) == 0 { |
| return 0, nil |
| } |
| |
| // Generate the manifest changes. |
| changes := []*pb.ManifestChange{} |
	for _, t := range all {
		// Add a delete change only if the table is not in memory.
		if !t.IsInmemory {
			changes = append(changes, newDeleteChange(t.ID()))
		}
	}
| changeSet := pb.ManifestChangeSet{Changes: changes} |
| if err := s.kv.manifest.addChanges(changeSet.Changes); err != nil { |
| return 0, err |
| } |
| |
| // Now that manifest has been successfully written, we can delete the tables. |
| for _, l := range s.levels { |
| l.Lock() |
| l.totalSize = 0 |
| l.tables = l.tables[:0] |
| l.Unlock() |
| } |
	for _, t := range all {
		if err := t.DecrRef(); err != nil {
			return 0, err
		}
	}
| return len(all), nil |
| } |
| |
// dropPrefixes runs an L0->L1 compaction, and then runs same-level compactions on the rest
// of the levels. For the L0->L1 compaction, it runs compactions normally, but skips over all
// the keys with the provided prefixes.
// For Li->Li compactions, it picks up the tables which could contain the prefixes. Tables
// whose keys all carry one of the prefixes are dropped quickly. The ones which also hold
// other keys are run through MergeIterator and compacted to create new tables. All the
// mechanisms of compaction apply, i.e. level sizes and the MANIFEST are updated as in the
// normal flow.
| func (s *levelsController) dropPrefixes(prefixes [][]byte) error { |
| opt := s.kv.opt |
	// Iterate levels in reverse order (deepest level first). If we instead dropped the
	// prefix from level 0 before, say, level 3, there would be a window in which level 0
	// is already compacted while an older version of a dropped key still exists at a
	// deeper level. An iterator created during that window would see that stale, older
	// value. Dropping the deepest (oldest) data first ensures lookups never return stale
	// data.
| for i := len(s.levels) - 1; i >= 0; i-- { |
| l := s.levels[i] |
| |
| l.RLock() |
| if l.level == 0 { |
| size := len(l.tables) |
| l.RUnlock() |
| |
| if size > 0 { |
			cp := compactionPriority{
				level: 0,
				// A unique score greater than 1.0 does two things: it identifies this
				// function in logs, and it forces a compaction.
				score:        1.74,
				dropPrefixes: prefixes,
			}
| if err := s.doCompact(174, cp); err != nil { |
| opt.Warningf("While compacting level 0: %v", err) |
| return nil |
| } |
| } |
| continue |
| } |
| |
		// Build a list of compaction tableGroups affecting all the prefixes we
		// need to drop. The groups must satisfy the invariant that the bottom
		// tables of a compaction are consecutive, so each tableGroup is one run
		// of consecutive tables that contain the prefixes.
| var tableGroups [][]*table.Table |
| var tableGroup []*table.Table |
| |
| finishGroup := func() { |
| if len(tableGroup) > 0 { |
| tableGroups = append(tableGroups, tableGroup) |
| tableGroup = nil |
| } |
| } |
| |
		for _, t := range l.tables {
			if containsAnyPrefixes(t, prefixes) {
				tableGroup = append(tableGroup, t)
			} else {
				finishGroup()
			}
		}
| finishGroup() |
| |
| l.RUnlock() |
| |
| if len(tableGroups) == 0 { |
| continue |
| } |
| _, span := otrace.StartSpan(context.Background(), "Badger.Compaction") |
| span.Annotatef(nil, "Compaction level: %v", l.level) |
| span.Annotatef(nil, "Drop Prefixes: %v", prefixes) |
| defer span.End() |
| opt.Infof("Dropping prefix at level %d (%d tableGroups)", l.level, len(tableGroups)) |
| for _, operation := range tableGroups { |
| cd := compactDef{ |
| span: span, |
| thisLevel: l, |
| nextLevel: l, |
| top: nil, |
| bot: operation, |
| dropPrefixes: prefixes, |
| t: s.levelTargets(), |
| } |
| cd.t.baseLevel = l.level |
| if err := s.runCompactDef(-1, l.level, cd); err != nil { |
| opt.Warningf("While running compact def: %+v. Error: %v", cd, err) |
| return err |
| } |
| } |
| } |
| return nil |
| } |
| |
| func (s *levelsController) startCompact(lc *z.Closer) { |
| n := s.kv.opt.NumCompactors |
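	// The closer is assumed to be created with one worker already registered, so we
	// add n-1 here to account for all n compactor goroutines.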
| lc.AddRunning(n - 1) |
| for i := 0; i < n; i++ { |
| go s.runCompactor(i, lc) |
| } |
| } |
| |
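// targets holds the computed base level plus the per-level size and file-size targets.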
| type targets struct { |
| baseLevel int |
| targetSz []int64 |
| fileSz []int64 |
| } |
| |
| // levelTargets calculates the targets for levels in the LSM tree. The idea comes from Dynamic Level |
| // Sizes ( https://rocksdb.org/blog/2015/07/23/dynamic-level.html ) in RocksDB. The sizes of levels |
| // are calculated based on the size of the lowest level, typically L6. So, if L6 size is 1GB, then |
| // L5 target size is 100MB, L4 target size is 10MB and so on. |
| // |
| // L0 files don't automatically go to L1. Instead, they get compacted to Lbase, where Lbase is |
| // chosen based on the first level which is non-empty from top (check L1 through L6). For an empty |
| // DB, that would be L6. So, L0 compactions go to L6, then L5, L4 and so on. |
| // |
// Lbase is advanced to the upper levels when its target size exceeds BaseLevelSize. For
// example, when L6 reaches 1.1GB, the L4 target size becomes 11MB, thus exceeding the
// BaseLevelSize of 10MB. L3 would then become the new Lbase, with a raw target size of
// 1.1MB < BaseLevelSize (clamped up to BaseLevelSize in practice).
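//
// As a worked example of the code below: with BaseLevelSize = 10MB, LevelSizeMultiplier
// = 10 and a 1GB L6, the targets come out to L6=1GB, L5=100MB, L4=10MB, and L1-L3
// clamped up to 10MB; Lbase is L4, the deepest level whose raw target does not exceed
// BaseLevelSize.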
| func (s *levelsController) levelTargets() targets { |
| adjust := func(sz int64) int64 { |
| if sz < s.kv.opt.BaseLevelSize { |
| return s.kv.opt.BaseLevelSize |
| } |
| return sz |
| } |
| |
| t := targets{ |
| targetSz: make([]int64, len(s.levels)), |
| fileSz: make([]int64, len(s.levels)), |
| } |
| // DB size is the size of the last level. |
| dbSize := s.lastLevel().getTotalSize() |
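	// Walk from the last level upward, dividing the target by LevelSizeMultiplier at
	// each step and clamping it to BaseLevelSize from below.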
| for i := len(s.levels) - 1; i > 0; i-- { |
| ltarget := adjust(dbSize) |
| t.targetSz[i] = ltarget |
| if t.baseLevel == 0 && ltarget <= s.kv.opt.BaseLevelSize { |
| t.baseLevel = i |
| } |
| dbSize /= int64(s.kv.opt.LevelSizeMultiplier) |
| } |
| |
| tsz := s.kv.opt.BaseTableSize |
| for i := 0; i < len(s.levels); i++ { |
| if i == 0 { |
| // Use MemTableSize for Level 0. Because at Level 0, we stop compactions based on the |
| // number of tables, not the size of the level. So, having a 1:1 size ratio between |
| // memtable size and the size of L0 files is better than churning out 32 files per |
| // memtable (assuming 64MB MemTableSize and 2MB BaseTableSize). |
| t.fileSz[i] = s.kv.opt.MemTableSize |
| } else if i <= t.baseLevel { |
| t.fileSz[i] = tsz |
| } else { |
| tsz *= int64(s.kv.opt.TableSizeMultiplier) |
| t.fileSz[i] = tsz |
| } |
| } |
| |
| // Bring the base level down to the last empty level. |
| for i := t.baseLevel + 1; i < len(s.levels)-1; i++ { |
| if s.levels[i].getTotalSize() > 0 { |
| break |
| } |
| t.baseLevel = i |
| } |
| |
| // If the base level is empty and the next level size is less than the |
| // target size, pick the next level as the base level. |
| b := t.baseLevel |
| lvl := s.levels |
| if b < len(lvl)-1 && lvl[b].getTotalSize() == 0 && lvl[b+1].getTotalSize() < t.targetSz[b+1] { |
| t.baseLevel++ |
| } |
| return t |
| } |
| |
| func (s *levelsController) runCompactor(id int, lc *z.Closer) { |
| defer lc.Done() |
| |
| randomDelay := time.NewTimer(time.Duration(rand.Int31n(1000)) * time.Millisecond) |
| select { |
| case <-randomDelay.C: |
| case <-lc.HasBeenClosed(): |
| randomDelay.Stop() |
| return |
| } |
| |
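	// moveL0toFront moves the L0 priority, if present, to the front of the slice
	// (e.g. [L1, L0, L2] -> [L0, L1, L2]), preserving the relative order of the rest.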
| moveL0toFront := func(prios []compactionPriority) []compactionPriority { |
| idx := -1 |
| for i, p := range prios { |
| if p.level == 0 { |
| idx = i |
| break |
| } |
| } |
| // If idx == -1, we didn't find L0. |
| // If idx == 0, then we don't need to do anything. L0 is already at the front. |
| if idx > 0 { |
| out := append([]compactionPriority{}, prios[idx]) |
| out = append(out, prios[:idx]...) |
| out = append(out, prios[idx+1:]...) |
| return out |
| } |
| return prios |
| } |
| |
| run := func(p compactionPriority) bool { |
| err := s.doCompact(id, p) |
| switch err { |
| case nil: |
| return true |
| case errFillTables: |
| // pass |
| default: |
| s.kv.opt.Warningf("While running doCompact: %v\n", err) |
| } |
| return false |
| } |
| |
| var priosBuffer []compactionPriority |
| runOnce := func() bool { |
| prios := s.pickCompactLevels(priosBuffer) |
| defer func() { |
| priosBuffer = prios |
| }() |
| if id == 0 { |
| // Worker ID zero prefers to compact L0 always. |
| prios = moveL0toFront(prios) |
| } |
| for _, p := range prios { |
| if id == 0 && p.level == 0 { |
| // Allow worker zero to run level 0, irrespective of its adjusted score. |
| } else if p.adjusted < 1.0 { |
| break |
| } |
| if run(p) { |
| return true |
| } |
| } |
| |
| return false |
| } |
| |
| tryLmaxToLmaxCompaction := func() { |
| p := compactionPriority{ |
| level: s.lastLevel().level, |
| t: s.levelTargets(), |
| } |
		run(p)
	}
	count := 0
| ticker := time.NewTicker(50 * time.Millisecond) |
| defer ticker.Stop() |
| for { |
| select { |
		// Additional signals (e.g. a dedicated done channel) could be added to this select.
| case <-ticker.C: |
| count++ |
			// Each tick is 50ms, so 200 ticks is 10 seconds.
| if s.kv.opt.LmaxCompaction && id == 2 && count >= 200 { |
| tryLmaxToLmaxCompaction() |
| count = 0 |
| } else { |
| runOnce() |
| } |
| case <-lc.HasBeenClosed(): |
| return |
| } |
| } |
| } |
| |
| type compactionPriority struct { |
| level int |
| score float64 |
| adjusted float64 |
| dropPrefixes [][]byte |
| t targets |
| } |
| |
| func (s *levelsController) lastLevel() *levelHandler { |
| return s.levels[len(s.levels)-1] |
| } |
| |
// pickCompactLevels determines which levels to compact.
// Based on: https://github.com/facebook/rocksdb/wiki/Leveled-Compaction
// It reuses priosBuffer to reduce memory allocation; passing nil is acceptable,
// in which case new memory is allocated.
| func (s *levelsController) pickCompactLevels(priosBuffer []compactionPriority) (prios []compactionPriority) { |
| t := s.levelTargets() |
| addPriority := func(level int, score float64) { |
| pri := compactionPriority{ |
| level: level, |
| score: score, |
| adjusted: score, |
| t: t, |
| } |
| prios = append(prios, pri) |
| } |
| |
| // Grow buffer to fit all levels. |
| if cap(priosBuffer) < len(s.levels) { |
| priosBuffer = make([]compactionPriority, 0, len(s.levels)) |
| } |
| prios = priosBuffer[:0] |
| |
| // Add L0 priority based on the number of tables. |
| addPriority(0, float64(s.levels[0].numTables())/float64(s.kv.opt.NumLevelZeroTables)) |
| |
| // All other levels use size to calculate priority. |
| for i := 1; i < len(s.levels); i++ { |
| // Don't consider those tables that are already being compacted right now. |
| delSize := s.cstatus.delSize(i) |
| |
| l := s.levels[i] |
| sz := l.getTotalSize() - delSize |
| addPriority(i, float64(sz)/float64(t.targetSz[i])) |
| } |
| y.AssertTrue(len(prios) == len(s.levels)) |
| |
| // The following code is borrowed from PebbleDB and results in healthier LSM tree structure. |
| // If Li-1 has score > 1.0, then we'll divide Li-1 score by Li. If Li score is >= 1.0, then Li-1 |
| // score is reduced, which means we'll prioritize the compaction of lower levels (L5, L4 and so |
| // on) over the higher levels (L0, L1 and so on). On the other hand, if Li score is < 1.0, then |
| // we'll increase the priority of Li-1. |
| // Overall what this means is, if the bottom level is already overflowing, then de-prioritize |
| // compaction of the above level. If the bottom level is not full, then increase the priority of |
| // above level. |
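	// For example, if L0's adjusted score is 2.0 and Lbase's is 4.0, L0's adjusted score
	// drops to 2.0/4.0 = 0.5 so Lbase drains first; if Lbase's score were 0.5 instead,
	// L0's would rise to 2.0/0.5 = 4.0.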
| var prevLevel int |
| for level := t.baseLevel; level < len(s.levels); level++ { |
| if prios[prevLevel].adjusted >= 1 { |
| // Avoid absurdly large scores by placing a floor on the score that we'll |
			// adjust a level by. The value of 0.01 was chosen somewhat arbitrarily.
| const minScore = 0.01 |
| if prios[level].score >= minScore { |
| prios[prevLevel].adjusted /= prios[level].adjusted |
| } else { |
| prios[prevLevel].adjusted /= minScore |
| } |
| } |
| prevLevel = level |
| } |
| |
| // Pick all the levels whose original score is >= 1.0, irrespective of their adjusted score. |
| // We'll still sort them by their adjusted score below. Having both these scores allows us to |
| // make better decisions about compacting L0. If we see a score >= 1.0, we can do L0->L0 |
| // compactions. If the adjusted score >= 1.0, then we can do L0->Lbase compactions. |
| out := prios[:0] |
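	// Skip the last level; Lmax -> Lmax compactions are picked separately (see
	// fillMaxLevelTables).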
| for _, p := range prios[:len(prios)-1] { |
| if p.score >= 1.0 { |
| out = append(out, p) |
| } |
| } |
| prios = out |
| |
| // Sort by the adjusted score. |
| sort.Slice(prios, func(i, j int) bool { |
| return prios[i].adjusted > prios[j].adjusted |
| }) |
| return prios |
| } |
| |
| // checkOverlap checks if the given tables overlap with any level from the given "lev" onwards. |
| func (s *levelsController) checkOverlap(tables []*table.Table, lev int) bool { |
| kr := getKeyRange(tables...) |
| for i, lh := range s.levels { |
| if i < lev { // Skip upper levels. |
| continue |
| } |
| lh.RLock() |
| left, right := lh.overlappingTables(levelHandlerRLocked{}, kr) |
| lh.RUnlock() |
| if right-left > 0 { |
| return true |
| } |
| } |
| return false |
| } |
| |
| // subcompact runs a single sub-compaction, iterating over the specified key-range only. |
| // |
| // We use splits to do a single compaction concurrently. If we have >= 3 tables |
| // involved in the bottom level during compaction, we choose key ranges to |
| // split the main compaction up into sub-compactions. Each sub-compaction runs |
| // concurrently, only iterating over the provided key range, generating tables. |
| // This speeds up the compaction significantly. |
| func (s *levelsController) subcompact(it y.Iterator, kr keyRange, cd compactDef, |
| inflightBuilders *y.Throttle, res chan<- *table.Table) { |
| |
| // Check overlap of the top level with the levels which are not being |
| // compacted in this compaction. |
| hasOverlap := s.checkOverlap(cd.allTables(), cd.nextLevel.level+1) |
| |
| // Pick a discard ts, so we can discard versions below this ts. We should |
| // never discard any versions starting from above this timestamp, because |
| // that would affect the snapshot view guarantee provided by transactions. |
| discardTs := s.kv.orc.discardAtOrBelow() |
| |
| // Try to collect stats so that we can inform value log about GC. That would help us find which |
| // value log file should be GCed. |
| discardStats := make(map[uint32]int64) |
| updateStats := func(vs y.ValueStruct) { |
| // We don't need to store/update discard stats when badger is running in Disk-less mode. |
| if s.kv.opt.InMemory { |
| return |
| } |
| if vs.Meta&bitValuePointer > 0 { |
| var vp valuePointer |
| vp.Decode(vs.Value) |
| discardStats[vp.Fid] += int64(vp.Len) |
| } |
| } |
| |
	// exceedsAllowedOverlap returns true if the given key range would overlap with 10 or more
	// tables from the level below nextLevel (i.e. nextLevel+1). This helps avoid generating
	// tables at Li with huge overlaps with Li+1.
| exceedsAllowedOverlap := func(kr keyRange) bool { |
| n2n := cd.nextLevel.level + 1 |
| if n2n <= 1 || n2n >= len(s.levels) { |
| return false |
| } |
| n2nl := s.levels[n2n] |
| n2nl.RLock() |
| defer n2nl.RUnlock() |
| |
| l, r := n2nl.overlappingTables(levelHandlerRLocked{}, kr) |
| return r-l >= 10 |
| } |
| |
| var ( |
| lastKey, skipKey []byte |
| numBuilds, numVersions int |
		// Denotes whether the first key of the current run of duplicate keys had
		// "DiscardEarlierVersions" set.
		firstKeyHasDiscardSet bool
| ) |
| |
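	// addKeys drains the iterator into the builder until the split's right boundary is
	// crossed, the builder reaches capacity on a new key, or the table's range would
	// overlap too many tables in the level after next.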
| addKeys := func(builder *table.Builder) { |
| timeStart := time.Now() |
| var numKeys, numSkips uint64 |
| var rangeCheck int |
| var tableKr keyRange |
| for ; it.Valid(); it.Next() { |
| // See if we need to skip the prefix. |
| if len(cd.dropPrefixes) > 0 && hasAnyPrefixes(it.Key(), cd.dropPrefixes) { |
| numSkips++ |
| updateStats(it.Value()) |
| continue |
| } |
| |
| // See if we need to skip this key. |
| if len(skipKey) > 0 { |
| if y.SameKey(it.Key(), skipKey) { |
| numSkips++ |
| updateStats(it.Value()) |
| continue |
| } else { |
| skipKey = skipKey[:0] |
| } |
| } |
| |
| if !y.SameKey(it.Key(), lastKey) { |
| firstKeyHasDiscardSet = false |
| if len(kr.right) > 0 && y.CompareKeys(it.Key(), kr.right) >= 0 { |
| break |
| } |
| if builder.ReachedCapacity() { |
| // Only break if we are on a different key, and have reached capacity. We want |
| // to ensure that all versions of the key are stored in the same sstable, and |
| // not divided across multiple tables at the same level. |
| break |
| } |
| lastKey = y.SafeCopy(lastKey, it.Key()) |
| numVersions = 0 |
| firstKeyHasDiscardSet = it.Value().Meta&bitDiscardEarlierVersions > 0 |
| |
| if len(tableKr.left) == 0 { |
| tableKr.left = y.SafeCopy(tableKr.left, it.Key()) |
| } |
| tableKr.right = lastKey |
| |
| rangeCheck++ |
| if rangeCheck%5000 == 0 { |
| // This table's range exceeds the allowed range overlap with the level after |
| // next. So, we stop writing to this table. If we don't do this, then we end up |
| // doing very expensive compactions involving too many tables. To amortize the |
| // cost of this check, we do it only every N keys. |
| if exceedsAllowedOverlap(tableKr) { |
| // s.kv.opt.Debugf("L%d -> L%d Breaking due to exceedsAllowedOverlap with |
| // kr: %s\n", cd.thisLevel.level, cd.nextLevel.level, tableKr) |
| break |
| } |
| } |
| } |
| |
| vs := it.Value() |
| version := y.ParseTs(it.Key()) |
| |
| isExpired := isDeletedOrExpired(vs.Meta, vs.ExpiresAt) |
| |
			// Do not discard entries inserted by the merge operator. These entries will be
			// discarded once they're merged.
| if version <= discardTs && vs.Meta&bitMergeEntry == 0 { |
| // Keep track of the number of versions encountered for this key. Only consider the |
| // versions which are below the minReadTs, otherwise, we might end up discarding the |
| // only valid version for a running transaction. |
| numVersions++ |
| // Keep the current version and discard all the next versions if |
| // - The `discardEarlierVersions` bit is set OR |
| // - We've already processed `NumVersionsToKeep` number of versions |
| // (including the current item being processed) |
| lastValidVersion := vs.Meta&bitDiscardEarlierVersions > 0 || |
| numVersions == s.kv.opt.NumVersionsToKeep |
| |
| if isExpired || lastValidVersion { |
| // If this version of the key is deleted or expired, skip all the rest of the |
| // versions. Ensure that we're only removing versions below readTs. |
| skipKey = y.SafeCopy(skipKey, it.Key()) |
| |
| switch { |
| // Add the key to the table only if it has not expired. |
| // We don't want to add the deleted/expired keys. |
| case !isExpired && lastValidVersion: |
| // Add this key. We have set skipKey, so the following key versions |
| // would be skipped. |
| case hasOverlap: |
| // If this key range has overlap with lower levels, then keep the deletion |
| // marker with the latest version, discarding the rest. We have set skipKey, |
| // so the following key versions would be skipped. |
| default: |
| // If no overlap, we can skip all the versions, by continuing here. |
| numSkips++ |
| updateStats(vs) |
| continue // Skip adding this key. |
| } |
| } |
| } |
| numKeys++ |
| var vp valuePointer |
| if vs.Meta&bitValuePointer > 0 { |
| vp.Decode(vs.Value) |
| } |
| switch { |
			case firstKeyHasDiscardSet:
				// This key is the same as the last key, which had "DiscardEarlierVersions"
				// set. The next compaction will drop this key if its ts > discardTs (of
				// that compaction).
| builder.AddStaleKey(it.Key(), vs, vp.Len) |
| case isExpired: |
| // If the key is expired, the next compaction will drop it if |
| // its ts > discardTs (of the next compaction). |
| builder.AddStaleKey(it.Key(), vs, vp.Len) |
| default: |
| builder.Add(it.Key(), vs, vp.Len) |
| } |
| } |
| s.kv.opt.Debugf("[%d] LOG Compact. Added %d keys. Skipped %d keys. Iteration took: %v", |
| cd.compactorId, numKeys, numSkips, time.Since(timeStart).Round(time.Millisecond)) |
| } // End of function: addKeys |
| |
| if len(kr.left) > 0 { |
| it.Seek(kr.left) |
| } else { |
| it.Rewind() |
| } |
| for it.Valid() { |
| if len(kr.right) > 0 && y.CompareKeys(it.Key(), kr.right) >= 0 { |
| break |
| } |
| |
| bopts := buildTableOptions(s.kv) |
| // Set TableSize to the target file size for that level. |
| bopts.TableSize = uint64(cd.t.fileSz[cd.nextLevel.level]) |
| builder := table.NewTableBuilder(bopts) |
| |
| // This would do the iteration and add keys to builder. |
| addKeys(builder) |
| |
		// Although it.Valid() held at least once in the loop above, addKeys may have
		// skipped every key (dropped prefixes, discarded versions), so the builder can
		// still be empty.
| if builder.Empty() { |
| // Cleanup builder resources: |
| builder.Finish() |
| builder.Close() |
| continue |
| } |
| numBuilds++ |
| if err := inflightBuilders.Do(); err != nil { |
| // Can't return from here, until I decrRef all the tables that I built so far. |
| break |
| } |
		go func(builder *table.Builder, fileID uint64) {
			var err error
			// Evaluate err at return time rather than at defer time, so that table-creation
			// failures below actually propagate to the throttle.
			defer func() { inflightBuilders.Done(err) }()
| defer builder.Close() |
| |
| var tbl *table.Table |
| if s.kv.opt.InMemory { |
| tbl, err = table.OpenInMemoryTable(builder.Finish(), fileID, &bopts) |
| } else { |
| fname := table.NewFilename(fileID, s.kv.opt.Dir) |
| tbl, err = table.CreateTable(fname, builder) |
| } |
| |
| // If we couldn't build the table, return fast. |
| if err != nil { |
| return |
| } |
| res <- tbl |
| }(builder, s.reserveFileID()) |
| } |
| s.kv.vlog.updateDiscardStats(discardStats) |
| s.kv.opt.Debugf("Discard stats: %v", discardStats) |
| } |
| |
| // compactBuildTables merges topTables and botTables to form a list of new tables. |
| func (s *levelsController) compactBuildTables( |
| lev int, cd compactDef) ([]*table.Table, func() error, error) { |
| |
| topTables := cd.top |
| botTables := cd.bot |
| |
| numTables := int64(len(topTables) + len(botTables)) |
| y.NumCompactionTablesAdd(s.kv.opt.MetricsEnabled, numTables) |
| defer y.NumCompactionTablesAdd(s.kv.opt.MetricsEnabled, -numTables) |
| |
| cd.span.Annotatef(nil, "Top tables count: %v Bottom tables count: %v", |
| len(topTables), len(botTables)) |
| |
| keepTable := func(t *table.Table) bool { |
| for _, prefix := range cd.dropPrefixes { |
| if bytes.HasPrefix(t.Smallest(), prefix) && |
| bytes.HasPrefix(t.Biggest(), prefix) { |
| // All the keys in this table have the dropPrefix. So, this |
| // table does not need to be in the iterator and can be |
| // dropped immediately. |
| return false |
| } |
| } |
| return true |
| } |
| var valid []*table.Table |
	for _, t := range botTables {
		if keepTable(t) {
			valid = append(valid, t)
		}
	}
| |
| newIterator := func() []y.Iterator { |
| // Create iterators across all the tables involved first. |
| var iters []y.Iterator |
| switch { |
| case lev == 0: |
| iters = append(iters, iteratorsReversed(topTables, table.NOCACHE)...) |
| case len(topTables) > 0: |
| y.AssertTrue(len(topTables) == 1) |
| iters = []y.Iterator{topTables[0].NewIterator(table.NOCACHE)} |
| } |
| // Next level has level>=1 and we can use ConcatIterator as key ranges do not overlap. |
| return append(iters, table.NewConcatIterator(valid, table.NOCACHE)) |
| } |
| |
| res := make(chan *table.Table, 3) |
| inflightBuilders := y.NewThrottle(8 + len(cd.splits)) |
| for _, kr := range cd.splits { |
| // Initiate Do here so we can register the goroutines for buildTables too. |
| if err := inflightBuilders.Do(); err != nil { |
| s.kv.opt.Errorf("cannot start subcompaction: %+v", err) |
| return nil, nil, err |
| } |
| go func(kr keyRange) { |
| defer inflightBuilders.Done(nil) |
| it := table.NewMergeIterator(newIterator(), false) |
| defer it.Close() |
| s.subcompact(it, kr, cd, inflightBuilders, res) |
| }(kr) |
| } |
| |
| var newTables []*table.Table |
| var wg sync.WaitGroup |
| wg.Add(1) |
| go func() { |
| defer wg.Done() |
| for t := range res { |
| newTables = append(newTables, t) |
| } |
| }() |
| |
| // Wait for all table builders to finish and also for newTables accumulator to finish. |
| err := inflightBuilders.Finish() |
| close(res) |
| wg.Wait() // Wait for all tables to be picked up. |
| |
| if err == nil { |
| // Ensure created files' directory entries are visible. We don't mind the extra latency |
| // from not doing this ASAP after all file creation has finished because this is a |
| // background operation. |
| err = s.kv.syncDir(s.kv.opt.Dir) |
| } |
| |
| if err != nil { |
| // An error happened. Delete all the newly created table files (by calling DecrRef |
| // -- we're the only holders of a ref). |
| _ = decrRefs(newTables) |
| return nil, nil, y.Wrapf(err, "while running compactions for: %+v", cd) |
| } |
| |
| sort.Slice(newTables, func(i, j int) bool { |
| return y.CompareKeys(newTables[i].Biggest(), newTables[j].Biggest()) < 0 |
| }) |
| return newTables, func() error { return decrRefs(newTables) }, nil |
| } |
| |
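// buildChangeSet creates the manifest changeset for a compaction: create-changes for the
// newly built tables and delete-changes for the compacted-away top and bottom tables.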
| func buildChangeSet(cd *compactDef, newTables []*table.Table) pb.ManifestChangeSet { |
| changes := []*pb.ManifestChange{} |
	for _, t := range newTables {
		changes = append(changes,
			newCreateChange(t.ID(), cd.nextLevel.level, t.KeyID(), t.CompressionType()))
	}
	for _, t := range cd.top {
		// Add a delete change only if the table is not in memory.
		if !t.IsInmemory {
			changes = append(changes, newDeleteChange(t.ID()))
		}
	}
	for _, t := range cd.bot {
		changes = append(changes, newDeleteChange(t.ID()))
	}
| return pb.ManifestChangeSet{Changes: changes} |
| } |
| |
| func hasAnyPrefixes(s []byte, listOfPrefixes [][]byte) bool { |
| for _, prefix := range listOfPrefixes { |
| if bytes.HasPrefix(s, prefix) { |
| return true |
| } |
| } |
| |
| return false |
| } |
| |
func containsPrefix(t *table.Table, prefix []byte) bool {
	smallValue := t.Smallest()
	largeValue := t.Biggest()
	if bytes.HasPrefix(smallValue, prefix) {
		return true
	}
	if bytes.HasPrefix(largeValue, prefix) {
		return true
	}
	isPresent := func() bool {
		ti := t.NewIterator(0)
| defer ti.Close() |
		// In the table iterator's Seek, we assume the key carries its version in the last 8
		// bytes. Seeking with ts=math.MaxUint64 encodes a zero version suffix, which sorts
		// before every other version of the same key, so we don't skip any key with this
		// prefix.
| ti.Seek(y.KeyWithTs(prefix, math.MaxUint64)) |
| return bytes.HasPrefix(ti.Key(), prefix) |
| } |
| |
| if bytes.Compare(prefix, smallValue) > 0 && |
| bytes.Compare(prefix, largeValue) < 0 { |
| // There may be a case when table contains [0x0000,...., 0xffff]. If we are searching for |
| // k=0x0011, we should not directly infer that k is present. It may not be present. |
| return isPresent() |
| } |
| |
| return false |
| } |
| |
func containsAnyPrefixes(t *table.Table, listOfPrefixes [][]byte) bool {
	for _, prefix := range listOfPrefixes {
		if containsPrefix(t, prefix) {
			return true
		}
	}
| } |
| |
| return false |
| } |
| |
| type compactDef struct { |
| span *otrace.Span |
| |
| compactorId int |
| t targets |
| p compactionPriority |
| thisLevel *levelHandler |
| nextLevel *levelHandler |
| |
| top []*table.Table |
| bot []*table.Table |
| |
| thisRange keyRange |
| nextRange keyRange |
| splits []keyRange |
| |
| thisSize int64 |
| |
| dropPrefixes [][]byte |
| } |
| |
| // addSplits can allow us to run multiple sub-compactions in parallel across the split key ranges. |
| func (s *levelsController) addSplits(cd *compactDef) { |
| cd.splits = cd.splits[:0] |
| |
| // Let's say we have 10 tables in cd.bot and min width = 3. Then, we'll pick |
| // 0, 1, 2 (pick), 3, 4, 5 (pick), 6, 7, 8 (pick), 9 (pick, because last table). |
| // This gives us 4 picks for 10 tables. |
	// In an edge case, 142 tables in the bottom level led to 48 splits. That's too many
	// splits, because each split then uses a lot of memory for its table builder.
	// So we cap the number of splits at 5.
| width := int(math.Ceil(float64(len(cd.bot)) / 5.0)) |
| if width < 3 { |
| width = 3 |
| } |
| skr := cd.thisRange |
| skr.extend(cd.nextRange) |
| |
| addRange := func(right []byte) { |
| skr.right = y.Copy(right) |
| cd.splits = append(cd.splits, skr) |
| |
| skr.left = skr.right |
| } |
| |
| for i, t := range cd.bot { |
| // last entry in bottom table. |
| if i == len(cd.bot)-1 { |
| addRange([]byte{}) |
| return |
| } |
| if i%width == width-1 { |
| // Right is assigned ts=0. The encoding ts bytes take MaxUint64-ts, |
| // so, those with smaller TS will be considered larger for the same key. |
| // Consider the following. |
| // Top table is [A1...C3(deleted)] |
| // bot table is [B1....C2] |
| // It will generate a split [A1 ... C0], including any records of Key C. |
| right := y.KeyWithTs(y.ParseKey(t.Biggest()), 0) |
| addRange(right) |
| } |
| } |
| } |
| |
| func (cd *compactDef) lockLevels() { |
| cd.thisLevel.RLock() |
| cd.nextLevel.RLock() |
| } |
| |
| func (cd *compactDef) unlockLevels() { |
| cd.nextLevel.RUnlock() |
| cd.thisLevel.RUnlock() |
| } |
| |
| func (cd *compactDef) allTables() []*table.Table { |
| ret := make([]*table.Table, 0, len(cd.top)+len(cd.bot)) |
| ret = append(ret, cd.top...) |
| ret = append(ret, cd.bot...) |
| return ret |
| } |
| |
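// fillTablesL0ToL0 picks small, sufficiently old L0 tables that are not already being
// compacted, to be merged into a single larger L0 table. Only compactor zero runs it.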
| func (s *levelsController) fillTablesL0ToL0(cd *compactDef) bool { |
| if cd.compactorId != 0 { |
| // Only compactor zero can work on this. |
| return false |
| } |
| |
| cd.nextLevel = s.levels[0] |
| cd.nextRange = keyRange{} |
| cd.bot = nil |
| |
| // Because this level and next level are both level 0, we should NOT acquire |
| // the read lock twice, because it can result in a deadlock. So, we don't |
| // call compactDef.lockLevels, instead locking the level only once and |
| // directly here. |
| // |
| // As per godocs on RWMutex: |
| // If a goroutine holds a RWMutex for reading and another goroutine might |
| // call Lock, no goroutine should expect to be able to acquire a read lock |
| // until the initial read lock is released. In particular, this prohibits |
| // recursive read locking. This is to ensure that the lock eventually |
| // becomes available; a blocked Lock call excludes new readers from |
| // acquiring the lock. |
| y.AssertTrue(cd.thisLevel.level == 0) |
| y.AssertTrue(cd.nextLevel.level == 0) |
| s.levels[0].RLock() |
| defer s.levels[0].RUnlock() |
| |
| s.cstatus.Lock() |
| defer s.cstatus.Unlock() |
| |
| top := cd.thisLevel.tables |
| var out []*table.Table |
| now := time.Now() |
| for _, t := range top { |
| if t.Size() >= 2*cd.t.fileSz[0] { |
| // This file is already big, don't include it. |
| continue |
| } |
		if now.Sub(t.CreatedAt) < 10*time.Second {
			// Created less than 10s ago. Don't pick it for compaction.
| continue |
| } |
| if _, beingCompacted := s.cstatus.tables[t.ID()]; beingCompacted { |
| continue |
| } |
| out = append(out, t) |
| } |
| |
| if len(out) < 4 { |
| // If we don't have enough tables to merge in L0, don't do it. |
| return false |
| } |
| cd.thisRange = infRange |
| cd.top = out |
| |
	// Prevent any other L0 -> Lbase compaction from happening, while this is going on.
| thisLevel := s.cstatus.levels[cd.thisLevel.level] |
| thisLevel.ranges = append(thisLevel.ranges, infRange) |
| for _, t := range out { |
| s.cstatus.tables[t.ID()] = struct{}{} |
| } |
| |
| // For L0->L0 compaction, we set the target file size to max, so the output is always one file. |
| // This significantly decreases the L0 table stalls and improves the performance. |
| cd.t.fileSz[0] = math.MaxUint32 |
| return true |
| } |
| |
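// fillTablesL0ToLbase picks the oldest run of overlapping L0 tables, along with the Lbase
// tables their combined key range overlaps, for an L0 -> Lbase compaction.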
| func (s *levelsController) fillTablesL0ToLbase(cd *compactDef) bool { |
| if cd.nextLevel.level == 0 { |
| panic("Base level can't be zero.") |
| } |
| // We keep cd.p.adjusted > 0.0 here to allow functions in db.go to artificially trigger |
| // L0->Lbase compactions. Those functions wouldn't be setting the adjusted score. |
| if cd.p.adjusted > 0.0 && cd.p.adjusted < 1.0 { |
| // Do not compact to Lbase if adjusted score is less than 1.0. |
| return false |
| } |
| cd.lockLevels() |
| defer cd.unlockLevels() |
| |
| top := cd.thisLevel.tables |
| if len(top) == 0 { |
| return false |
| } |
| |
| var out []*table.Table |
| if len(cd.dropPrefixes) > 0 { |
| // Use all tables if drop prefix is set. We don't want to compact only a |
| // sub-range. We want to compact all the tables. |
| out = top |
| |
| } else { |
| var kr keyRange |
| // cd.top[0] is the oldest file. So we start from the oldest file first. |
| for _, t := range top { |
| dkr := getKeyRange(t) |
| if kr.overlapsWith(dkr) { |
| out = append(out, t) |
| kr.extend(dkr) |
| } else { |
| break |
| } |
| } |
| } |
| cd.thisRange = getKeyRange(out...) |
| cd.top = out |
| |
| left, right := cd.nextLevel.overlappingTables(levelHandlerRLocked{}, cd.thisRange) |
| cd.bot = make([]*table.Table, right-left) |
| copy(cd.bot, cd.nextLevel.tables[left:right]) |
| |
| if len(cd.bot) == 0 { |
| cd.nextRange = cd.thisRange |
| } else { |
| cd.nextRange = getKeyRange(cd.bot...) |
| } |
| return s.cstatus.compareAndAdd(thisAndNextLevelRLocked{}, *cd) |
| } |
| |
| // fillTablesL0 would try to fill tables from L0 to be compacted with Lbase. If |
| // it can not do that, it would try to compact tables from L0 -> L0. |
| // |
| // Say L0 has 10 tables. |
| // fillTablesL0ToLbase picks up 5 tables to compact from L0 -> L5. |
| // Next call to fillTablesL0 would run L0ToLbase again, which fails this time. |
// So, instead, we run fillTablesL0ToL0, which picks up the remaining 5 tables to
// be compacted within L0. Additionally, it would set the compaction range in
| // cstatus to inf, so no other L0 -> Lbase compactions can happen. |
| // Thus, L0 -> L0 must finish for the next L0 -> Lbase to begin. |
| func (s *levelsController) fillTablesL0(cd *compactDef) bool { |
| if ok := s.fillTablesL0ToLbase(cd); ok { |
| return true |
| } |
| return s.fillTablesL0ToL0(cd) |
| } |
| |
// sortByStaleDataSize sorts tables based on the amount of stale data they have.
| // This is useful in removing tombstones. |
| func (s *levelsController) sortByStaleDataSize(tables []*table.Table, cd *compactDef) { |
| if len(tables) == 0 || cd.nextLevel == nil { |
| return |
| } |
| |
| sort.Slice(tables, func(i, j int) bool { |
| return tables[i].StaleDataSize() > tables[j].StaleDataSize() |
| }) |
| } |
| |
| // sortByHeuristic sorts tables in increasing order of MaxVersion, so we |
| // compact older tables first. |
| func (s *levelsController) sortByHeuristic(tables []*table.Table, cd *compactDef) { |
| if len(tables) == 0 || cd.nextLevel == nil { |
| return |
| } |
| |
| // Sort tables by max version. This is what RocksDB does. |
| sort.Slice(tables, func(i, j int) bool { |
| return tables[i].MaxVersion() < tables[j].MaxVersion() |
| }) |
| } |
| |
// fillMaxLevelTables picks tables for a maxLevel -> maxLevel compaction based on their stale
// data size. It should be called with a lock on the levels.
| func (s *levelsController) fillMaxLevelTables(tables []*table.Table, cd *compactDef) bool { |
| sortedTables := make([]*table.Table, len(tables)) |
| copy(sortedTables, tables) |
| s.sortByStaleDataSize(sortedTables, cd) |
| |
| if len(sortedTables) > 0 && sortedTables[0].StaleDataSize() == 0 { |
| // This is a maxLevel to maxLevel compaction and we don't have any stale data. |
| return false |
| } |
| cd.bot = []*table.Table{} |
| collectBotTables := func(t *table.Table, needSz int64) { |
| totalSize := t.Size() |
| |
| j := sort.Search(len(tables), func(i int) bool { |
| return y.CompareKeys(tables[i].Smallest(), t.Smallest()) >= 0 |
| }) |
| y.AssertTrue(tables[j].ID() == t.ID()) |
| j++ |
		// Collect tables until we reach the required size.
| for j < len(tables) { |
| newT := tables[j] |
| totalSize += newT.Size() |
| |
| if totalSize >= needSz { |
| break |
| } |
| cd.bot = append(cd.bot, newT) |
| cd.nextRange.extend(getKeyRange(newT)) |
| j++ |
| } |
| } |
| now := time.Now() |
| for _, t := range sortedTables { |
| // If the maxVersion is above the discardTs, we won't clean anything in |
| // the compaction. So skip this table. |
| if t.MaxVersion() > s.kv.orc.discardAtOrBelow() { |
| continue |
| } |
| if now.Sub(t.CreatedAt) < time.Hour { |
| // Just created it an hour ago. Don't pick for compaction. |
| continue |
| } |
| // If the stale data size is less than 10 MB, it might not be worth |
| // rewriting the table. Skip it. |
| if t.StaleDataSize() < 10<<20 { |
| continue |
| } |
| |
| cd.thisSize = t.Size() |
| cd.thisRange = getKeyRange(t) |
		// Set the next range to be the same as the current range. If we don't do
		// this, we won't be able to run more than one max level compaction at a time.
| cd.nextRange = cd.thisRange |
| // If we're already compacting this range, don't do anything. |
| if s.cstatus.overlapsWith(cd.thisLevel.level, cd.thisRange) { |
| continue |
| } |
| |
| // Found a valid table! |
| cd.top = []*table.Table{t} |
| |
| needFileSz := cd.t.fileSz[cd.thisLevel.level] |
| // The table size is what we want so no need to collect more tables. |
| if t.Size() >= needFileSz { |
| break |
| } |
| // TableSize is less than what we want. Collect more tables for compaction. |
| // If the level has multiple small tables, we collect all of them |
| // together to form a bigger table. |
| collectBotTables(t, needFileSz) |
| if !s.cstatus.compareAndAdd(thisAndNextLevelRLocked{}, *cd) { |
| cd.bot = cd.bot[:0] |
| cd.nextRange = keyRange{} |
| continue |
| } |
| return true |
| } |
| if len(cd.top) == 0 { |
| return false |
| } |
| |
| return s.cstatus.compareAndAdd(thisAndNextLevelRLocked{}, *cd) |
| } |
| |
| func (s *levelsController) fillTables(cd *compactDef) bool { |
| cd.lockLevels() |
| defer cd.unlockLevels() |
| |
| tables := make([]*table.Table, len(cd.thisLevel.tables)) |
| copy(tables, cd.thisLevel.tables) |
| if len(tables) == 0 { |
| return false |
| } |
| // We're doing a maxLevel to maxLevel compaction. Pick tables based on the stale data size. |
| if cd.thisLevel.isLastLevel() { |
| return s.fillMaxLevelTables(tables, cd) |
| } |
| // We pick tables, so we compact older tables first. This is similar to |
| // kOldestLargestSeqFirst in RocksDB. |
| s.sortByHeuristic(tables, cd) |
| |
| for _, t := range tables { |
| cd.thisSize = t.Size() |
| cd.thisRange = getKeyRange(t) |
| // If we're already compacting this range, don't do anything. |
| if s.cstatus.overlapsWith(cd.thisLevel.level, cd.thisRange) { |
| continue |
| } |
| cd.top = []*table.Table{t} |
| left, right := cd.nextLevel.overlappingTables(levelHandlerRLocked{}, cd.thisRange) |
| |
| cd.bot = make([]*table.Table, right-left) |
| copy(cd.bot, cd.nextLevel.tables[left:right]) |
| |
| if len(cd.bot) == 0 { |
| cd.bot = []*table.Table{} |
| cd.nextRange = cd.thisRange |
| if !s.cstatus.compareAndAdd(thisAndNextLevelRLocked{}, *cd) { |
| continue |
| } |
| return true |
| } |
| cd.nextRange = getKeyRange(cd.bot...) |
| |
| if s.cstatus.overlapsWith(cd.nextLevel.level, cd.nextRange) { |
| continue |
| } |
| if !s.cstatus.compareAndAdd(thisAndNextLevelRLocked{}, *cd) { |
| continue |
| } |
| return true |
| } |
| return false |
| } |
| |
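// runCompactDef executes the compaction described by cd: it builds the new tables, records
// the change in the MANIFEST, and only then swaps the new tables into nextLevel and deletes
// the old ones from thisLevel, keeping reads consistent throughout.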
| func (s *levelsController) runCompactDef(id, l int, cd compactDef) (err error) { |
| if len(cd.t.fileSz) == 0 { |
| return errors.New("Filesizes cannot be zero. Targets are not set") |
| } |
| timeStart := time.Now() |
| |
| thisLevel := cd.thisLevel |
| nextLevel := cd.nextLevel |
| |
| y.AssertTrue(len(cd.splits) == 0) |
| if thisLevel.level == nextLevel.level { |
| // don't do anything for L0 -> L0 and Lmax -> Lmax. |
| } else { |
| s.addSplits(&cd) |
| } |
| if len(cd.splits) == 0 { |
| cd.splits = append(cd.splits, keyRange{}) |
| } |
| |
	// Tables should never be moved directly between levels; they should always be
	// rewritten, so that invalid versions can be discarded.
| |
| newTables, decr, err := s.compactBuildTables(l, cd) |
| if err != nil { |
| return err |
| } |
| defer func() { |
| // Only assign to err, if it's not already nil. |
| if decErr := decr(); err == nil { |
| err = decErr |
| } |
| }() |
| changeSet := buildChangeSet(&cd, newTables) |
| |
| // We write to the manifest _before_ we delete files (and after we created files) |
| if err := s.kv.manifest.addChanges(changeSet.Changes); err != nil { |
| return err |
| } |
| |
| getSizes := func(tables []*table.Table) int64 { |
| size := int64(0) |
| for _, i := range tables { |
| size += i.Size() |
| } |
| return size |
| } |
| |
| sizeNewTables := int64(0) |
| sizeOldTables := int64(0) |
| if s.kv.opt.MetricsEnabled { |
| sizeNewTables = getSizes(newTables) |
| sizeOldTables = getSizes(cd.bot) + getSizes(cd.top) |
| y.NumBytesCompactionWrittenAdd(s.kv.opt.MetricsEnabled, nextLevel.strLevel, sizeNewTables) |
| } |
| |
| // See comment earlier in this function about the ordering of these ops, and the order in which |
| // we access levels when reading. |
| if err := nextLevel.replaceTables(cd.bot, newTables); err != nil { |
| return err |
| } |
| if err := thisLevel.deleteTables(cd.top); err != nil { |
| return err |
| } |
| |
| // Note: For level 0, while doCompact is running, it is possible that new tables are added. |
| // However, the tables are added only to the end, so it is ok to just delete the first table. |
| |
| from := append(tablesToString(cd.top), tablesToString(cd.bot)...) |
| to := tablesToString(newTables) |
| if dur := time.Since(timeStart); dur > 2*time.Second { |
		// dur > 2s implies dur > 1s, so this log line is always tagged as expensive.
		expensive := " [E]"
| s.kv.opt.Infof("[%d]%s LOG Compact %d->%d (%d, %d -> %d tables with %d splits)."+ |
| " [%s] -> [%s], took %v\n, deleted %d bytes", |
| id, expensive, thisLevel.level, nextLevel.level, len(cd.top), len(cd.bot), |
| len(newTables), len(cd.splits), strings.Join(from, " "), strings.Join(to, " "), |
| dur.Round(time.Millisecond), sizeOldTables-sizeNewTables) |
| } |
| |
| if cd.thisLevel.level != 0 && len(newTables) > 2*s.kv.opt.LevelSizeMultiplier { |
| s.kv.opt.Infof("This Range (numTables: %d)\nLeft:\n%s\nRight:\n%s\n", |
| len(cd.top), hex.Dump(cd.thisRange.left), hex.Dump(cd.thisRange.right)) |
| s.kv.opt.Infof("Next Range (numTables: %d)\nLeft:\n%s\nRight:\n%s\n", |
| len(cd.bot), hex.Dump(cd.nextRange.left), hex.Dump(cd.nextRange.right)) |
| } |
| return nil |
| } |
| |
| func tablesToString(tables []*table.Table) []string { |
| var res []string |
| for _, t := range tables { |
| res = append(res, fmt.Sprintf("%05d", t.ID())) |
| } |
| res = append(res, ".") |
| return res |
| } |
| |
| var errFillTables = stderrors.New("Unable to fill tables") |
| |
| // doCompact picks some table on level l and compacts it away to the next level. |
| func (s *levelsController) doCompact(id int, p compactionPriority) error { |
| l := p.level |
| y.AssertTrue(l < s.kv.opt.MaxLevels) // Sanity check. |
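	// A baseLevel of zero means this priority came without computed targets, so compute
	// them now.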
| if p.t.baseLevel == 0 { |
| p.t = s.levelTargets() |
| } |
| |
| _, span := otrace.StartSpan(context.Background(), "Badger.Compaction") |
| defer span.End() |
| |
| cd := compactDef{ |
| compactorId: id, |
| span: span, |
| p: p, |
| t: p.t, |
| thisLevel: s.levels[l], |
| dropPrefixes: p.dropPrefixes, |
| } |
| |
| // While picking tables to be compacted, both levels' tables are expected to |
| // remain unchanged. |
| if l == 0 { |
| cd.nextLevel = s.levels[p.t.baseLevel] |
| if !s.fillTablesL0(&cd) { |
| return errFillTables |
| } |
| } else { |
| cd.nextLevel = cd.thisLevel |
| // We're not compacting the last level so pick the next level. |
| if !cd.thisLevel.isLastLevel() { |
| cd.nextLevel = s.levels[l+1] |
| } |
| if !s.fillTables(&cd) { |
| return errFillTables |
| } |
| } |
| defer s.cstatus.delete(cd) // Remove the ranges from compaction status. |
| |
| span.Annotatef(nil, "Compaction: %+v", cd) |
| if err := s.runCompactDef(id, l, cd); err != nil { |
| // This compaction couldn't be done successfully. |
| s.kv.opt.Warningf("[Compactor: %d] LOG Compact FAILED with error: %+v: %+v", id, err, cd) |
| return err |
| } |
| |
| s.kv.opt.Debugf("[Compactor: %d] Compaction for level: %d DONE", id, cd.thisLevel.level) |
| return nil |
| } |
| |
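// addLevel0Table adds the given table to level 0, recording it in the manifest first and
// stalling if L0 already holds too many tables.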
| func (s *levelsController) addLevel0Table(t *table.Table) error { |
| // Add table to manifest file only if it is not opened in memory. We don't want to add a table |
| // to the manifest file if it exists only in memory. |
| if !t.IsInmemory { |
| // We update the manifest _before_ the table becomes part of a levelHandler, because at that |
| // point it could get used in some compaction. This ensures the manifest file gets updated in |
| // the proper order. (That means this update happens before that of some compaction which |
| // deletes the table.) |
| err := s.kv.manifest.addChanges([]*pb.ManifestChange{ |
| newCreateChange(t.ID(), 0, t.KeyID(), t.CompressionType()), |
| }) |
| if err != nil { |
| return err |
| } |
| } |
| |
| for !s.levels[0].tryAddLevel0Table(t) { |
| // Before we unstall, we need to make sure that level 0 is healthy. |
| timeStart := time.Now() |
| for s.levels[0].numTables() >= s.kv.opt.NumLevelZeroTablesStall { |
| time.Sleep(10 * time.Millisecond) |
| } |
| dur := time.Since(timeStart) |
| if dur > time.Second { |
| s.kv.opt.Infof("L0 was stalled for %s\n", dur.Round(time.Millisecond)) |
| } |
		// Record the stall in milliseconds, as the l0stallsMs name implies.
		s.l0stallsMs.Add(dur.Milliseconds())
| } |
| |
| return nil |
| } |
| |
| func (s *levelsController) close() error { |
| err := s.cleanupLevels() |
| return y.Wrap(err, "levelsController.Close") |
| } |
| |
// get searches for a given key in all the levels of the LSM tree. It returns the value
// with the highest version that is <= the version encoded in the key. If no such
// version is found, it returns an empty y.ValueStruct.
| func (s *levelsController) get(key []byte, maxVs y.ValueStruct, startLevel int) ( |
| y.ValueStruct, error) { |
| if s.kv.IsClosed() { |
| return y.ValueStruct{}, ErrDBClosed |
| } |
| // It's important that we iterate the levels from 0 on upward. The reason is, if we iterated |
| // in opposite order, or in parallel (naively calling all the h.RLock() in some order) we could |
| // read level L's tables post-compaction and level L+1's tables pre-compaction. (If we do |
| // parallelize this, we will need to call the h.RLock() function by increasing order of level |
| // number.) |
| version := y.ParseTs(key) |
| for _, h := range s.levels { |
| // Ignore all levels below startLevel. This is useful for GC when L0 is kept in memory. |
| if h.level < startLevel { |
| continue |
| } |
| vs, err := h.get(key) // Calls h.RLock() and h.RUnlock(). |
| if err != nil { |
| return y.ValueStruct{}, y.Wrapf(err, "get key: %q", key) |
| } |
| if vs.Value == nil && vs.Meta == 0 { |
| continue |
| } |
| y.NumBytesReadsLSMAdd(s.kv.opt.MetricsEnabled, int64(len(vs.Value))) |
| if vs.Version == version { |
| return vs, nil |
| } |
| if maxVs.Version < vs.Version { |
| maxVs = vs |
| } |
| } |
| if len(maxVs.Value) > 0 { |
| y.NumGetsWithResultsAdd(s.kv.opt.MetricsEnabled, 1) |
| } |
| return maxVs, nil |
| } |
| |
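// iteratorsReversed returns iterators over th from the newest table to the oldest, the
// order the merge iterator expects so that newer L0 tables take precedence for a key.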
| func iteratorsReversed(th []*table.Table, opt int) []y.Iterator { |
| out := make([]y.Iterator, 0, len(th)) |
| for i := len(th) - 1; i >= 0; i-- { |
| // This will increment the reference of the table handler. |
| out = append(out, th[i].NewIterator(opt)) |
| } |
| return out |
| } |
| |
// getTables returns tables from all levels. It calls IncrRef on all returned tables.
| func (s *levelsController) getTables(opt *IteratorOptions) [][]*table.Table { |
| res := make([][]*table.Table, 0, len(s.levels)) |
| for _, level := range s.levels { |
| res = append(res, level.getTables(opt)) |
| } |
| return res |
| } |
| |
| // iterators returns an array of iterators, for merging. |
| // Note: This obtains references for the table handlers. Remember to close these iterators. |
| func (s *levelsController) iterators(opt *IteratorOptions) []y.Iterator { |
| // Just like with get, it's important we iterate the levels from 0 on upward, to avoid missing |
| // data when there's a compaction. |
| itrs := make([]y.Iterator, 0, len(s.levels)) |
| for _, level := range s.levels { |
| itrs = append(itrs, level.iterators(opt)...) |
| } |
| return itrs |
| } |
| |
| // TableInfo represents the information about a table. |
| type TableInfo struct { |
| ID uint64 |
| Level int |
| Left []byte |
| Right []byte |
| KeyCount uint32 // Number of keys in the table |
| OnDiskSize uint32 |
| StaleDataSize uint32 |
| UncompressedSize uint32 |
| MaxVersion uint64 |
| IndexSz int |
| BloomFilterSize int |
| } |
| |
| func (s *levelsController) getTableInfo() (result []TableInfo) { |
| for _, l := range s.levels { |
| l.RLock() |
| for _, t := range l.tables { |
| info := TableInfo{ |
| ID: t.ID(), |
| Level: l.level, |
| Left: t.Smallest(), |
| Right: t.Biggest(), |
| KeyCount: t.KeyCount(), |
| OnDiskSize: t.OnDiskSize(), |
| StaleDataSize: t.StaleDataSize(), |
| IndexSz: t.IndexSize(), |
| BloomFilterSize: t.BloomFilterSize(), |
| UncompressedSize: t.UncompressedSize(), |
| MaxVersion: t.MaxVersion(), |
| } |
| result = append(result, info) |
| } |
| l.RUnlock() |
| } |
| sort.Slice(result, func(i, j int) bool { |
| if result[i].Level != result[j].Level { |
| return result[i].Level < result[j].Level |
| } |
| return result[i].ID < result[j].ID |
| }) |
| return |
| } |
| |
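// LevelInfo reports the current shape and compaction state of one LSM level.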
| type LevelInfo struct { |
| Level int |
| NumTables int |
| Size int64 |
| TargetSize int64 |
| TargetFileSize int64 |
| IsBaseLevel bool |
| Score float64 |
| Adjusted float64 |
| StaleDatSize int64 |
| } |
| |
| func (s *levelsController) getLevelInfo() []LevelInfo { |
| t := s.levelTargets() |
| prios := s.pickCompactLevels(nil) |
| result := make([]LevelInfo, len(s.levels)) |
| for i, l := range s.levels { |
| l.RLock() |
| result[i].Level = i |
| result[i].Size = l.totalSize |
| result[i].NumTables = len(l.tables) |
| result[i].StaleDatSize = l.totalStaleSize |
| |
| l.RUnlock() |
| |
| result[i].TargetSize = t.targetSz[i] |
| result[i].TargetFileSize = t.fileSz[i] |
| result[i].IsBaseLevel = t.baseLevel == i |
| } |
| for _, p := range prios { |
| result[p.level].Score = p.score |
| result[p.level].Adjusted = p.adjusted |
| } |
| return result |
| } |
| |
| // verifyChecksum verifies checksum for all tables on all levels. |
| func (s *levelsController) verifyChecksum() error { |
| var tables []*table.Table |
| for _, l := range s.levels { |
| l.RLock() |
| tables = tables[:0] |
| for _, t := range l.tables { |
| tables = append(tables, t) |
| t.IncrRef() |
| } |
| l.RUnlock() |
| |
| for _, t := range tables { |
| errChkVerify := t.VerifyChecksum() |
| if err := t.DecrRef(); err != nil { |
| s.kv.opt.Errorf("unable to decrease reference of table: %s while "+ |
| "verifying checksum with error: %s", t.Filename(), err) |
| } |
| |
| if errChkVerify != nil { |
| return errChkVerify |
| } |
| } |
| } |
| |
| return nil |
| } |
| |
// keySplits returns the sorted list of splits for all the levels and tables based
// on the block offsets.
| func (s *levelsController) keySplits(numPerTable int, prefix []byte) []string { |
| splits := make([]string, 0) |
| for _, l := range s.levels { |
| l.RLock() |
| for _, t := range l.tables { |
| tableSplits := t.KeySplits(numPerTable, prefix) |
| splits = append(splits, tableSplits...) |
| } |
| l.RUnlock() |
| } |
| sort.Strings(splits) |
| return splits |
| } |
| |
// AddTable builds a table from the contents of kv.Value, adds it to the given level, and
// records the change in the MANIFEST.
| func (lc *levelsController) AddTable( |
| kv *pb.KV, lev int, dk *pb.DataKey, change *pb.ManifestChange) error { |
| // TODO: Encryption / Decryption might be required for the table, if the sender and receiver |
| // don't have same encryption mode. See if inplace encryption/decryption can be done. |
| // Tables are sent in the sorted order, so no need to sort them here. |
| encrypted := len(lc.kv.opt.EncryptionKey) > 0 |
| y.AssertTrue((dk != nil && encrypted) || (dk == nil && !encrypted)) |
| // The keyId is zero if there is no encryption. |
| opts := buildTableOptions(lc.kv) |
| opts.Compression = options.CompressionType(change.Compression) |
| opts.DataKey = dk |
| |
| fileID := lc.reserveFileID() |
| fname := table.NewFilename(fileID, lc.kv.opt.Dir) |
| |
| // kv.Value is owned by the z.buffer. Ensure that we copy this buffer. |
| var tbl *table.Table |
| var err error |
| if lc.kv.opt.InMemory { |
| if tbl, err = table.OpenInMemoryTable(y.Copy(kv.Value), fileID, &opts); err != nil { |
| return errors.Wrap(err, "while creating in-memory table from buffer") |
| } |
| } else { |
| if tbl, err = table.CreateTableFromBuffer(fname, kv.Value, opts); err != nil { |
| return errors.Wrap(err, "while creating table from buffer") |
| } |
| } |
| |
| lc.levels[lev].addTable(tbl) |
	// Release the ref held by table creation above; addTable adds its own reference.
| _ = tbl.DecrRef() |
| |
| change.Id = fileID |
| change.Level = uint32(lev) |
| if dk != nil { |
| change.KeyId = dk.KeyId |
| } |
| // We use the same data KeyId. So, change.KeyId remains the same. |
| y.AssertTrue(change.Op == pb.ManifestChange_CREATE) |
| return lc.kv.manifest.addChanges([]*pb.ManifestChange{change}) |
| } |