blob: 400a6ba66f215b23df909c9ed814cfaa7c76ae9e [file] [log] [blame]
/*
 * Copyright 2017 Dgraph Labs, Inc. and Contributors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
16
Manish R Jain6911fde2017-04-16 06:20:1617package badger
18
19import (
Pawan Rawalefb2a4a2017-05-29 05:00:4520 "bytes"
Janardhan Reddyffaaa662017-10-02 06:47:4621 "fmt"
Steven Allen5242a992018-10-04 21:40:5222 "hash/crc32"
Manish R Jain3d225d72020-11-26 02:19:0923 "math"
balaji20900322019-08-23 11:12:4024 "sort"
Manish R Jain6911fde2017-04-16 06:20:1625 "sync"
Deepak Joisa5499e52017-11-02 02:03:1426 "time"
Manish R Jain6911fde2017-04-16 06:20:1627
Joshua Goldstein31b75882023-02-27 22:21:4228 "github.com/dgraph-io/badger/v4/table"
29 "github.com/dgraph-io/badger/v4/y"
Aman Mangal7f657f82024-10-25 16:33:3430 "github.com/dgraph-io/ristretto/v2/z"
Manish R Jain6911fde2017-04-16 06:20:1631)
32
// prefetchStatus records whether an Item has already gone through a prefetch attempt.
type prefetchStatus uint8

const (
	// prefetched is set on an Item once prefetchValue has run for it, whether or not the read
	// succeeded. It starts at 1 so that the zero value means "not prefetched".
	prefetched prefetchStatus = iota + 1
)
38
Manish R Jainabaad902017-10-04 10:55:5639// Item is returned during iteration. Both the Key() and Value() output is only valid until
Manish R Jain6bb56f22017-05-11 01:04:3340// iterator.Next() is called.
Manish R Jainabaad902017-10-04 10:55:5641type Item struct {
Deepak Joisa5499e52017-11-02 02:03:1442 key []byte
43 vptr []byte
Deepak Joisa5499e52017-11-02 02:03:1444 val []byte
Deepak Joisa5499e52017-11-02 02:03:1445 version uint64
Ibrahim Jarifcdf09c02020-09-20 17:19:4046 expiresAt uint64
47
48 slice *y.Slice // Used only during prefetching.
49 next *Item
50 txn *Txn
51
52 err error
53 wg sync.WaitGroup
54 status prefetchStatus
55 meta byte // We need to store meta to know about bitValuePointer.
56 userMeta byte
Janardhan Reddyffaaa662017-10-02 06:47:4657}
58
Lanre Adelowo8e23f142018-03-09 23:12:4759// String returns a string representation of Item
60func (item *Item) String() string {
61 return fmt.Sprintf("key=%q, version=%d, meta=%x", item.Key(), item.Version(), item.meta)
62}
63
Manish R Jain1d625f42017-10-05 00:47:5664// Key returns the key.
65//
66// Key is only valid as long as item is valid, or transaction is valid. If you need to use it
Bertram Truongfc94c572018-10-08 17:57:3867// outside its validity, please use KeyCopy.
Manish R Jainabaad902017-10-04 10:55:5668func (item *Item) Key() []byte {
Janardhan Reddyffaaa662017-10-02 06:47:4669 return item.key
70}
71
Janardhan Reddy8b897692018-03-13 05:23:5372// KeyCopy returns a copy of the key of the item, writing it to dst slice.
73// If nil is passed, or capacity of dst isn't sufficient, a new slice would be allocated and
74// returned.
75func (item *Item) KeyCopy(dst []byte) []byte {
76 return y.SafeCopy(dst, item.key)
77}
78
Janardhan Reddyffaaa662017-10-02 06:47:4679// Version returns the commit timestamp of the item.
Manish R Jainabaad902017-10-04 10:55:5680func (item *Item) Version() uint64 {
Janardhan Reddyffaaa662017-10-02 06:47:4681 return item.version
Manish R Jain6911fde2017-04-16 06:20:1682}
83
Manish R Jain1d625f42017-10-05 00:47:5684// Value retrieves the value of the item from the value log.
Deepak Joiscb5a7692017-09-07 09:39:1885//
Deepak Joisa6c4a1c2017-12-19 07:45:2586// This method must be called within a transaction. Calling it outside a
87// transaction is considered undefined behavior. If an iterator is being used,
88// then Item.Value() is defined in the current iteration only, because items are
89// reused.
90//
91// If you need to use a value outside a transaction, please use Item.ValueCopy
Janardhan Reddy269b9f72018-02-26 11:07:1492// instead, or copy it yourself. Value might change once discard or commit is called.
93// Use ValueCopy if you want to do a Set after Get.
Manish R Jain439fd462018-10-04 18:15:0594func (item *Item) Value(fn func(val []byte) error) error {
Manish R Jain0f9030a2017-05-02 03:40:4795 item.wg.Wait()
Deepak Joisb9aae1b2017-08-31 05:06:4996 if item.status == prefetched {
Manish R Jainc10276c2018-09-25 04:13:4897 if item.err == nil && fn != nil {
denkhaus09d06642019-01-04 22:48:4598 if err := fn(item.val); err != nil {
99 return err
100 }
Manish R Jainc10276c2018-09-25 04:13:48101 }
102 return item.err
Deepak Joisb9aae1b2017-08-31 05:06:49103 }
Manish R Jain83aa09d2017-10-04 07:20:27104 buf, cb, err := item.yieldItemValue()
Manish R Jainc10276c2018-09-25 04:13:48105 defer runCallback(cb)
Manish R Jain439fd462018-10-04 18:15:05106 if err != nil {
107 return err
Manish R Jain83aa09d2017-10-04 07:20:27108 }
Manish R Jain439fd462018-10-04 18:15:05109 if fn != nil {
110 return fn(buf)
111 }
112 return nil
Manish R Jain6911fde2017-04-16 06:20:16113}
114
Manish R Jainb3568eb2017-11-13 03:04:41115// ValueCopy returns a copy of the value of the item from the value log, writing it to dst slice.
116// If nil is passed, or capacity of dst isn't sufficient, a new slice would be allocated and
117// returned. Tip: It might make sense to reuse the returned slice as dst argument for the next call.
118//
119// This function is useful in long running iterate/update transactions to avoid a write deadlock.
120// See Github issue: https://github.com/dgraph-io/badger/issues/315
121func (item *Item) ValueCopy(dst []byte) ([]byte, error) {
122 item.wg.Wait()
123 if item.status == prefetched {
124 return y.SafeCopy(dst, item.val), item.err
125 }
126 buf, cb, err := item.yieldItemValue()
127 defer runCallback(cb)
128 return y.SafeCopy(dst, buf), err
129}
130
Manish R Jainabaad902017-10-04 10:55:56131func (item *Item) hasValue() bool {
Manish R Jaind9799672017-09-01 04:19:21132 if item.meta == 0 && item.vptr == nil {
133 // key not found
134 return false
135 }
Manish R Jaind9799672017-09-01 04:19:21136 return true
137}
138
Janardhan Reddy8b897692018-03-13 05:23:53139// IsDeletedOrExpired returns true if item contains deleted or expired value.
140func (item *Item) IsDeletedOrExpired() bool {
141 return isDeletedOrExpired(item.meta, item.expiresAt)
142}
143
Ibrahim Jarif09e9b632019-04-15 10:43:20144// DiscardEarlierVersions returns whether the item was created with the
Martin Martinez Riverab85f5ae2019-01-09 01:50:02145// option to discard earlier versions of a key when multiple are available.
Manish R Jain79c98fc2018-05-05 01:56:43146func (item *Item) DiscardEarlierVersions() bool {
147 return item.meta&bitDiscardEarlierVersions > 0
148}
149
Manish R Jainabaad902017-10-04 10:55:56150func (item *Item) yieldItemValue() ([]byte, func(), error) {
Manish R Jaine201d7b2018-06-04 18:50:41151 key := item.Key() // No need to copy.
Ibrahim Jarif0a5046f2020-09-28 16:31:33152 if !item.hasValue() {
153 return nil, nil, nil
Manish R Jain7af00762018-05-08 23:42:00154 }
Ibrahim Jarif0a5046f2020-09-28 16:31:33155
156 if item.slice == nil {
157 item.slice = new(y.Slice)
158 }
159
160 if (item.meta & bitValuePointer) == 0 {
161 val := item.slice.Resize(len(item.vptr))
162 copy(val, item.vptr)
163 return val, nil, nil
164 }
165
166 var vp valuePointer
167 vp.Decode(item.vptr)
168 db := item.txn.db
169 result, cb, err := db.vlog.Read(vp, item.slice)
170 if err != nil {
Paul Chesnaisda1dcac2023-07-17 22:41:18171 db.opt.Errorf("Unable to read: Key: %v, Version : %v, meta: %v, userMeta: %v"+
Manish R Jaine3a0d292020-10-07 01:41:41172 " Error: %v", key, item.version, item.meta, item.userMeta, err)
Manish R Jain3d225d72020-11-26 02:19:09173 var txn *Txn
174 if db.opt.managedTxns {
175 txn = db.NewTransactionAt(math.MaxUint64, false)
176 } else {
177 txn = db.NewTransaction(false)
178 }
Ibrahim Jariffeb1f5f2020-11-25 13:15:52179 defer txn.Discard()
180
181 iopt := DefaultIteratorOptions
182 iopt.AllVersions = true
183 iopt.InternalAccess = true
184 iopt.PrefetchValues = false
185
186 it := txn.NewKeyIterator(item.Key(), iopt)
187 defer it.Close()
188 for it.Rewind(); it.Valid(); it.Next() {
189 item := it.Item()
190 var vp valuePointer
191 if item.meta&bitValuePointer > 0 {
192 vp.Decode(item.vptr)
193 }
Paul Chesnaisda1dcac2023-07-17 22:41:18194 db.opt.Errorf("Key: %v, Version : %v, meta: %v, userMeta: %v valuePointer: %+v",
Ibrahim Jariffeb1f5f2020-11-25 13:15:52195 item.Key(), item.version, item.meta, item.userMeta, vp)
196 }
Ibrahim Jarif0a5046f2020-09-28 16:31:33197 }
Ibrahim Jariffeb1f5f2020-11-25 13:15:52198 // Don't return error if we cannot read the value. Just log the error.
199 return result, cb, nil
Manish R Jain83aa09d2017-10-04 07:20:27200}
201
// runCallback invokes cb unless it is nil. It exists so callers can unconditionally defer the
// callback returned by a value read.
func runCallback(cb func()) {
	if cb == nil {
		return
	}
	cb()
}
207
Manish R Jainabaad902017-10-04 10:55:56208func (item *Item) prefetchValue() {
Manish R Jain83aa09d2017-10-04 07:20:27209 val, cb, err := item.yieldItemValue()
210 defer runCallback(cb)
211
212 item.err = err
213 item.status = prefetched
214 if val == nil {
215 return
216 }
Manish R Jaine3a0d292020-10-07 01:41:41217 buf := item.slice.Resize(len(val))
218 copy(buf, val)
219 item.val = buf
Deepak Joisb9aae1b2017-08-31 05:06:49220}
221
Bertram Truongfc94c572018-10-08 17:57:38222// EstimatedSize returns the approximate size of the key-value pair.
Deepak Joisb9aae1b2017-08-31 05:06:49223//
224// This can be called while iterating through a store to quickly estimate the
225// size of a range of key-value pairs (without fetching the corresponding
226// values).
Manish R Jainabaad902017-10-04 10:55:56227func (item *Item) EstimatedSize() int64 {
Manish R Jaind9799672017-09-01 04:19:21228 if !item.hasValue() {
229 return 0
230 }
Manish R Jain1d625f42017-10-05 00:47:56231 if (item.meta & bitValuePointer) == 0 {
Manish R Jaind9799672017-09-01 04:19:21232 return int64(len(item.key) + len(item.vptr))
233 }
234 var vp valuePointer
235 vp.Decode(item.vptr)
236 return int64(vp.Len) // includes key length.
237}
238
Ibrahim Jariffd599072019-03-07 20:08:30239// KeySize returns the size of the key.
240// Exact size of the key is key + 8 bytes of timestamp
241func (item *Item) KeySize() int64 {
242 return int64(len(item.key))
243}
244
Ibrahim Jarifd8e1fcf2019-07-19 05:07:03245// ValueSize returns the approximate size of the value.
Steven Allen5242a992018-10-04 21:40:52246//
247// This can be called to quickly estimate the size of a value without fetching
248// it.
249func (item *Item) ValueSize() int64 {
250 if !item.hasValue() {
251 return 0
252 }
253 if (item.meta & bitValuePointer) == 0 {
254 return int64(len(item.vptr))
255 }
256 var vp valuePointer
257 vp.Decode(item.vptr)
Manish R Jain51995642018-10-04 21:50:35258
259 klen := int64(len(item.key) + 8) // 8 bytes for timestamp.
Ibrahim Jarifd8e1fcf2019-07-19 05:07:03260 // 6 bytes are for the approximate length of the header. Since header is encoded in varint, we
261 // cannot find the exact length of header without fetching it.
262 return int64(vp.Len) - klen - 6 - crc32.Size
Steven Allen5242a992018-10-04 21:40:52263}
264
Manish R Jain730b6aa2017-08-31 08:07:35265// UserMeta returns the userMeta set by the user. Typically, this byte, optionally set by the user
266// is used to interpret the value.
Manish R Jainabaad902017-10-04 10:55:56267func (item *Item) UserMeta() byte {
Janardhan Reddy835ef242017-07-27 00:19:14268 return item.userMeta
269}
270
Deepak Joisa5499e52017-11-02 02:03:14271// ExpiresAt returns a Unix time value indicating when the item will be
272// considered expired. 0 indicates that the item will never expire.
273func (item *Item) ExpiresAt() uint64 {
274 return item.expiresAt
275}
276
Manish R Jain83aa09d2017-10-04 07:20:27277// TODO: Switch this to use linked list container in Go.
Manish R Jain0f9030a2017-05-02 03:40:47278type list struct {
Manish R Jainabaad902017-10-04 10:55:56279 head *Item
280 tail *Item
Manish R Jain6911fde2017-04-16 06:20:16281}
282
Manish R Jainabaad902017-10-04 10:55:56283func (l *list) push(i *Item) {
Manish R Jain0f9030a2017-05-02 03:40:47284 i.next = nil
285 if l.tail == nil {
286 l.head = i
287 l.tail = i
Manish R Jain6911fde2017-04-16 06:20:16288 return
289 }
Manish R Jain0f9030a2017-05-02 03:40:47290 l.tail.next = i
291 l.tail = i
292}
293
Manish R Jainabaad902017-10-04 10:55:56294func (l *list) pop() *Item {
Manish R Jain0f9030a2017-05-02 03:40:47295 if l.head == nil {
296 return nil
Manish R Jain6911fde2017-04-16 06:20:16297 }
Manish R Jain0f9030a2017-05-02 03:40:47298 i := l.head
299 if l.head == l.tail {
300 l.tail = nil
301 l.head = nil
302 } else {
303 l.head = i.next
304 }
305 i.next = nil
306 return i
307}
308
// IteratorOptions is used to set options when iterating over Badger key-value
// stores.
//
// This package provides DefaultIteratorOptions which contains options that
// should work for most applications. Consider using that as a starting point
// before customizing it for your own needs.
type IteratorOptions struct {
	// PrefetchSize is the number of KV pairs to prefetch while iterating.
	// Valid only if PrefetchValues is true.
	PrefetchSize int
	// PrefetchValues indicates whether we should prefetch values during
	// iteration and store them.
	PrefetchValues bool
	// Reverse is the direction of iteration. False is forward, true is backward.
	Reverse bool
	// AllVersions fetches all valid versions of the same key.
	AllVersions bool
	// InternalAccess is used to allow internal access to badger keys.
	InternalAccess bool

	// The following options are used to narrow down the SSTables that the iterator
	// picks up. If Prefix is specified, only tables which could have this
	// prefix are picked based on their range of keys.

	// prefixIsKey, if set, makes the iterator use the prefix for bloom filter lookup.
	prefixIsKey bool
	// Prefix restricts iteration to keys with this prefix.
	Prefix []byte
	// SinceTs restricts reads to data that has version > SinceTs.
	SinceTs uint64
}
333
balaji20900322019-08-23 11:12:40334func (opt *IteratorOptions) compareToPrefix(key []byte) int {
335 // We should compare key without timestamp. For example key - a[TS] might be > "aa" prefix.
336 key = y.ParseKey(key)
337 if len(key) > len(opt.Prefix) {
338 key = key[:len(opt.Prefix)]
339 }
340 return bytes.Compare(key, opt.Prefix)
341}
342
Martin Martinez Riverab85f5ae2019-01-09 01:50:02343func (opt *IteratorOptions) pickTable(t table.TableInterface) bool {
Ibrahim Jarif31c061e2021-02-05 12:49:34344 // Ignore this table if its max version is less than the sinceTs.
345 if t.MaxVersion() < opt.SinceTs {
346 return false
347 }
Manish R Jain7d460292018-11-25 19:53:03348 if len(opt.Prefix) == 0 {
349 return true
350 }
balaji20900322019-08-23 11:12:40351 if opt.compareToPrefix(t.Smallest()) > 0 {
Manish R Jain49a49e32018-11-26 21:07:12352 return false
353 }
balaji20900322019-08-23 11:12:40354 if opt.compareToPrefix(t.Biggest()) < 0 {
Manish R Jain49a49e32018-11-26 21:07:12355 return false
356 }
357 // Bloom filter lookup would only work if opt.Prefix does NOT have the read
358 // timestamp as part of the key.
Ibrahim Jarif599363b2020-10-03 16:34:58359 if opt.prefixIsKey && t.DoesNotHave(y.Hash(opt.Prefix)) {
Manish R Jain49a49e32018-11-26 21:07:12360 return false
361 }
362 return true
Manish R Jain7d460292018-11-25 19:53:03363}
364
balaji20900322019-08-23 11:12:40365// pickTables picks the necessary table for the iterator. This function also assumes
366// that the tables are sorted in the right order.
367func (opt *IteratorOptions) pickTables(all []*table.Table) []*table.Table {
Ibrahim Jarif31c061e2021-02-05 12:49:34368 filterTables := func(tables []*table.Table) []*table.Table {
Aman Mangalffd74f32023-02-14 15:51:41369 if opt.SinceTs == 0 {
370 return tables
Ibrahim Jarif31c061e2021-02-05 12:49:34371 }
Aman Mangalffd74f32023-02-14 15:51:41372 out := tables[:0]
373 for _, t := range tables {
374 if t.MaxVersion() < opt.SinceTs {
375 continue
376 }
377 out = append(out, t)
378 }
379 return out
Ibrahim Jarif31c061e2021-02-05 12:49:34380 }
381
balaji20900322019-08-23 11:12:40382 if len(opt.Prefix) == 0 {
383 out := make([]*table.Table, len(all))
384 copy(out, all)
Ibrahim Jarif31c061e2021-02-05 12:49:34385 return filterTables(out)
balaji20900322019-08-23 11:12:40386 }
387 sIdx := sort.Search(len(all), func(i int) bool {
Manish R Jain74f2e022020-12-03 23:30:11388 // table.Biggest >= opt.prefix
389 // if opt.Prefix < table.Biggest, then surely it is not in any of the preceding tables.
balaji20900322019-08-23 11:12:40390 return opt.compareToPrefix(all[i].Biggest()) >= 0
391 })
392 if sIdx == len(all) {
393 // Not found.
394 return []*table.Table{}
395 }
396
397 filtered := all[sIdx:]
398 if !opt.prefixIsKey {
399 eIdx := sort.Search(len(filtered), func(i int) bool {
400 return opt.compareToPrefix(filtered[i].Smallest()) > 0
401 })
402 out := make([]*table.Table, len(filtered[:eIdx]))
403 copy(out, filtered[:eIdx])
Ibrahim Jarif31c061e2021-02-05 12:49:34404 return filterTables(out)
balaji20900322019-08-23 11:12:40405 }
406
Manish R Jain74f2e022020-12-03 23:30:11407 // opt.prefixIsKey == true. This code is optimizing for opt.prefixIsKey part.
balaji20900322019-08-23 11:12:40408 var out []*table.Table
Manish R Jain74f2e022020-12-03 23:30:11409 hash := y.Hash(opt.Prefix)
balaji20900322019-08-23 11:12:40410 for _, t := range filtered {
Manish R Jain74f2e022020-12-03 23:30:11411 // When we encounter the first table whose smallest key is higher than opt.Prefix, we can
412 // stop. This is an IMPORTANT optimization, just considering how often we call
413 // NewKeyIterator.
414 if opt.compareToPrefix(t.Smallest()) > 0 {
415 // if table.Smallest > opt.Prefix, then this and all tables after this can be ignored.
416 break
balaji20900322019-08-23 11:12:40417 }
Manish R Jain74f2e022020-12-03 23:30:11418 // opt.Prefix is actually the key. So, we can run bloom filter checks
419 // as well.
420 if t.DoesNotHave(hash) {
421 continue
422 }
423 out = append(out, t)
balaji20900322019-08-23 11:12:40424 }
Ibrahim Jarif31c061e2021-02-05 12:49:34425 return filterTables(out)
balaji20900322019-08-23 11:12:40426}
427
Deepak Jois13e687b2017-08-29 10:28:52428// DefaultIteratorOptions contains default options when iterating over Badger key-value stores.
Manish R Jain83151362017-05-11 01:50:48429var DefaultIteratorOptions = IteratorOptions{
Deepak Jois26638252017-09-11 09:02:29430 PrefetchValues: true,
Deepak Joisb9aae1b2017-08-31 05:06:49431 PrefetchSize: 100,
432 Reverse: false,
Manish R Jain90be3872017-10-02 08:22:55433 AllVersions: false,
Manish R Jain83151362017-05-11 01:50:48434}
435
436// Iterator helps iterating over the KV pairs in a lexicographically sorted order.
Manish R Jain6911fde2017-04-16 06:20:16437type Iterator struct {
Ibrahim Jarif73ea6e62019-10-21 10:33:21438 iitr y.Iterator
Manish R Jain50a2e6d2017-09-28 00:33:28439 txn *Txn
440 readTs uint64
Manish R Jain4ab15482017-04-28 09:38:01441
Manish R Jain0f9030a2017-05-02 03:40:47442 opt IteratorOptions
Manish R Jainabaad902017-10-04 10:55:56443 item *Item
Manish R Jain0f9030a2017-05-02 03:40:47444 data list
445 waste list
Manish R Jain50a2e6d2017-09-28 00:33:28446
447 lastKey []byte // Used to skip over multiple versions of the same key.
Manish R Jain41d96562018-09-25 19:40:45448
Naman Jain38eb5a12021-01-07 13:18:00449 closed bool
450 scanned int // Used to estimate the size of data scanned by iterator.
Martin Martinez Rivera6eaa5002020-05-22 17:45:26451
452 // ThreadId is an optional value that can be set to identify which goroutine created
453 // the iterator. It can be used, for example, to uniquely identify each of the
454 // iterators created by the stream interface
455 ThreadId int
Manish R Jain70088c62020-12-03 03:30:43456
Manish R Jainb80c7922020-12-04 21:38:44457 Alloc *z.Allocator
Manish R Jain4ab15482017-04-28 09:38:01458}
459
Manish R Jain1d625f42017-10-05 00:47:56460// NewIterator returns a new iterator. Depending upon the options, either only keys, or both
461// key-value pairs would be fetched. The keys are returned in lexicographically sorted order.
Manish R Jain41d96562018-09-25 19:40:45462// Using prefetch is recommended if you're doing a long running iteration, for performance.
463//
464// Multiple Iterators:
Aman Mangalb1ea3602023-02-22 17:36:03465// For a read-only txn, multiple iterators can be running simultaneously. However, for a read-write
Elliot Courantaf22dfd2020-05-13 06:07:33466// txn, iterators have the nuance of being a snapshot of the writes for the transaction at the time
467// iterator was created. If writes are performed after an iterator is created, then that iterator
468// will not be able to see those writes. Only writes performed before an iterator was created can be
469// viewed.
Manish R Jain1d625f42017-10-05 00:47:56470func (txn *Txn) NewIterator(opt IteratorOptions) *Iterator {
Manish R Jain41d96562018-09-25 19:40:45471 if txn.discarded {
Aman Mangalb1ea3602023-02-22 17:36:03472 panic(ErrDiscardedTxn)
Manish R Jain41d96562018-09-25 19:40:45473 }
Ibrahim Jarif1e21a942020-08-26 10:03:36474 if txn.db.IsClosed() {
Aman Mangalb1ea3602023-02-22 17:36:03475 panic(ErrDBClosed)
Ibrahim Jarif1e21a942020-08-26 10:03:36476 }
Elliot Courantaf22dfd2020-05-13 06:07:33477
Harshil Goelec80d3d2023-07-18 05:54:55478 y.NumIteratorsCreatedAdd(txn.db.opt.MetricsEnabled, 1)
479
Elliot Courantaf22dfd2020-05-13 06:07:33480 // Keep track of the number of active iterators.
Aman Mangalb1ea3602023-02-22 17:36:03481 txn.numIterators.Add(1)
Manish R Jainb1ad1e92018-06-19 00:07:00482
Aman Mangalb1ea3602023-02-22 17:36:03483 // TODO: If Prefix is set, only pick those memtables which have keys with the prefix.
Manish R Jain1d625f42017-10-05 00:47:56484 tables, decr := txn.db.getMemTables()
485 defer decr()
486 txn.db.vlog.incrIteratorCount()
487 var iters []y.Iterator
Janardhan Reddy097bd7a2017-11-28 01:20:36488 if itr := txn.newPendingWritesIterator(opt.Reverse); itr != nil {
489 iters = append(iters, itr)
490 }
Manish R Jain1d625f42017-10-05 00:47:56491 for i := 0; i < len(tables); i++ {
Manish R Jaine3a0d292020-10-07 01:41:41492 iters = append(iters, tables[i].sl.NewUniIterator(opt.Reverse))
Manish R Jain1d625f42017-10-05 00:47:56493 }
Aman Mangalffd74f32023-02-14 15:51:41494 iters = append(iters, txn.db.lc.iterators(&opt)...) // This will increment references.
Manish R Jain1d625f42017-10-05 00:47:56495 res := &Iterator{
496 txn: txn,
Ibrahim Jarif73ea6e62019-10-21 10:33:21497 iitr: table.NewMergeIterator(iters, opt.Reverse),
Manish R Jain1d625f42017-10-05 00:47:56498 opt: opt,
499 readTs: txn.readTs,
500 }
501 return res
502}
503
Manish R Jain49a49e32018-11-26 21:07:12504// NewKeyIterator is just like NewIterator, but allows the user to iterate over all versions of a
505// single key. Internally, it sets the Prefix option in provided opt, and uses that prefix to
506// additionally run bloom filter lookups before picking tables from the LSM tree.
507func (txn *Txn) NewKeyIterator(key []byte, opt IteratorOptions) *Iterator {
508 if len(opt.Prefix) > 0 {
509 panic("opt.Prefix should be nil for NewKeyIterator.")
510 }
511 opt.Prefix = key // This key must be without the timestamp.
512 opt.prefixIsKey = true
Ibrahim Jarifa11c5e02019-08-01 13:53:46513 opt.AllVersions = true
Manish R Jain49a49e32018-11-26 21:07:12514 return txn.NewIterator(opt)
515}
516
Manish R Jainabaad902017-10-04 10:55:56517func (it *Iterator) newItem() *Item {
Manish R Jain0f9030a2017-05-02 03:40:47518 item := it.waste.pop()
519 if item == nil {
Ibrahim Jarifcdf09c02020-09-20 17:19:40520 item = &Item{slice: new(y.Slice), txn: it.txn}
Manish R Jain4ab15482017-04-28 09:38:01521 }
Manish R Jain4ab15482017-04-28 09:38:01522 return item
Manish R Jain6911fde2017-04-16 06:20:16523}
524
Manish R Jain1d625f42017-10-05 00:47:56525// Item returns pointer to the current key-value pair.
Manish R Jain83151362017-05-11 01:50:48526// This item is only valid until it.Next() gets called.
Manish R Jainabaad902017-10-04 10:55:56527func (it *Iterator) Item() *Item {
Manish R Jain50a2e6d2017-09-28 00:33:28528 tx := it.txn
Manish R Jain41d96562018-09-25 19:40:45529 tx.addReadKey(it.item.Key())
Manish R Jain50a2e6d2017-09-28 00:33:28530 return it.item
531}
Manish R Jain0f9030a2017-05-02 03:40:47532
Manish R Jain83151362017-05-11 01:50:48533// Valid returns false when iteration is done.
Manish R Jain7d460292018-11-25 19:53:03534func (it *Iterator) Valid() bool {
535 if it.item == nil {
536 return false
537 }
Ibrahim Jarifa11c5e02019-08-01 13:53:46538 if it.opt.prefixIsKey {
539 return bytes.Equal(it.item.key, it.opt.Prefix)
540 }
Manish R Jain7d460292018-11-25 19:53:03541 return bytes.HasPrefix(it.item.key, it.opt.Prefix)
542}
Manish R Jain0f9030a2017-05-02 03:40:47543
Salim Alamieaad7eb2017-06-28 01:16:34544// ValidForPrefix returns false when iteration is done
545// or when the current key is not prefixed by the specified prefix.
546func (it *Iterator) ValidForPrefix(prefix []byte) bool {
Manish R Jain7d460292018-11-25 19:53:03547 return it.Valid() && bytes.HasPrefix(it.item.key, prefix)
Salim Alamieaad7eb2017-06-28 01:16:34548}
549
Manish R Jain83151362017-05-11 01:50:48550// Close would close the iterator. It is important to call this when you're done with iteration.
Manish R Jain41b1cd22017-05-03 01:12:24551func (it *Iterator) Close() {
Manish R Jain41d96562018-09-25 19:40:45552 if it.closed {
553 return
554 }
555 it.closed = true
Naman Jain195b2122021-02-22 15:59:19556 if it.iitr == nil {
Aman Mangalb1ea3602023-02-22 17:36:03557 it.txn.numIterators.Add(-1)
Naman Jain195b2122021-02-22 15:59:19558 return
559 }
Manish R Jain8d041dd2018-02-27 00:19:34560
Manish R Jain41d96562018-09-25 19:40:45561 it.iitr.Close()
Manish R Jain8d041dd2018-02-27 00:19:34562 // It is important to wait for the fill goroutines to finish. Otherwise, we might leave zombie
563 // goroutines behind, which are waiting to acquire file read locks after DB has been closed.
564 waitFor := func(l list) {
565 item := l.pop()
566 for item != nil {
567 item.wg.Wait()
568 item = l.pop()
569 }
570 }
571 waitFor(it.waste)
572 waitFor(it.data)
573
Sam Hughes9ed12b92017-09-14 05:15:36574 // TODO: We could handle this error.
Manish R Jainabaad902017-10-04 10:55:56575 _ = it.txn.db.vlog.decrIteratorCount()
Aman Mangalb1ea3602023-02-22 17:36:03576 it.txn.numIterators.Add(-1)
Manish R Jain41b1cd22017-05-03 01:12:24577}
578
Manish R Jain83151362017-05-11 01:50:48579// Next would advance the iterator by one. Always check it.Valid() after a Next()
580// to ensure you have access to a valid it.Item().
Manish R Jain0f9030a2017-05-02 03:40:47581func (it *Iterator) Next() {
Naman Jain195b2122021-02-22 15:59:19582 if it.iitr == nil {
583 return
584 }
Manish R Jain0f9030a2017-05-02 03:40:47585 // Reuse current item
586 it.item.wg.Wait() // Just cleaner to wait before pushing to avoid doing ref counting.
Naman Jain38eb5a12021-01-07 13:18:00587 it.scanned += len(it.item.key) + len(it.item.val) + len(it.item.vptr) + 2
Manish R Jain0f9030a2017-05-02 03:40:47588 it.waste.push(it.item)
589
590 // Set next item to current
591 it.item = it.data.pop()
Harshil Goelb77f2e82024-09-12 07:12:55592 for it.iitr.Valid() && hasPrefix(it) {
Manish R Jain50a2e6d2017-09-28 00:33:28593 if it.parseItem() {
594 // parseItem calls one extra next.
595 // This is used to deal with the complexity of reverse iteration.
Szymon921c1532017-05-19 10:21:50596 break
597 }
Manish R Jain4ab15482017-04-28 09:38:01598 }
Manish R Jain50a2e6d2017-09-28 00:33:28599}
Manish R Jain306b99d2017-05-29 04:27:42600
Janardhan Reddya0576272017-12-19 07:26:36601func isDeletedOrExpired(meta byte, expiresAt uint64) bool {
602 if meta&bitDelete > 0 {
Deepak Joisa5499e52017-11-02 02:03:14603 return true
604 }
Janardhan Reddya0576272017-12-19 07:26:36605 if expiresAt == 0 {
Deepak Joisa5499e52017-11-02 02:03:14606 return false
607 }
Janardhan Reddya0576272017-12-19 07:26:36608 return expiresAt <= uint64(time.Now().Unix())
Deepak Joisa5499e52017-11-02 02:03:14609}
610
Manish R Jain50a2e6d2017-09-28 00:33:28611// parseItem is a complex function because it needs to handle both forward and reverse iteration
612// implementation. We store keys such that their versions are sorted in descending order. This makes
613// forward iteration efficient, but revese iteration complicated. This tradeoff is better because
Naman Jain3e5ab7c2021-01-25 09:27:29614// forward iteration is more common than reverse. It returns true, if either the iterator is invalid
615// or it has pushed an item into it.data list, else it returns false.
Manish R Jain50a2e6d2017-09-28 00:33:28616//
617// This function advances the iterator.
618func (it *Iterator) parseItem() bool {
619 mi := it.iitr
620 key := mi.Key()
621
Manish R Jainabaad902017-10-04 10:55:56622 setItem := func(item *Item) {
Manish R Jain90be3872017-10-02 08:22:55623 if it.item == nil {
624 it.item = item
625 } else {
626 it.data.push(item)
627 }
628 }
629
Naman Jain3e5ab7c2021-01-25 09:27:29630 isInternalKey := bytes.HasPrefix(key, badgerPrefix)
Manish R Jain50a2e6d2017-09-28 00:33:28631 // Skip badger keys.
Naman Jain3e5ab7c2021-01-25 09:27:29632 if !it.opt.InternalAccess && isInternalKey {
Manish R Jain50a2e6d2017-09-28 00:33:28633 mi.Next()
634 return false
Manish R Jain6911fde2017-04-16 06:20:16635 }
Manish R Jain50a2e6d2017-09-28 00:33:28636
637 // Skip any versions which are beyond the readTs.
638 version := y.ParseTs(key)
Ibrahim Jarif31c061e2021-02-05 12:49:34639 // Ignore everything that is above the readTs and below or at the sinceTs.
Naman Jaine1f9dce2021-07-08 08:14:04640 if version > it.readTs || (it.opt.SinceTs > 0 && version <= it.opt.SinceTs) {
Manish R Jain50a2e6d2017-09-28 00:33:28641 mi.Next()
642 return false
643 }
644
Naman Jain3e5ab7c2021-01-25 09:27:29645 // Skip banned keys only if it does not have badger internal prefix.
646 if !isInternalKey && it.txn.db.isBanned(key) != nil {
647 mi.Next()
648 return false
649 }
650
Manish R Jain90be3872017-10-02 08:22:55651 if it.opt.AllVersions {
Janardhan Reddya0576272017-12-19 07:26:36652 // Return deleted or expired values also, otherwise user can't figure out
653 // whether the key was deleted.
Manish R Jain90be3872017-10-02 08:22:55654 item := it.newItem()
655 it.fill(item)
656 setItem(item)
657 mi.Next()
658 return true
659 }
660
Manish R Jain50a2e6d2017-09-28 00:33:28661 // If iterating in forward direction, then just checking the last key against current key would
662 // be sufficient.
663 if !it.opt.Reverse {
664 if y.SameKey(it.lastKey, key) {
665 mi.Next()
666 return false
667 }
668 // Only track in forward direction.
669 // We should update lastKey as soon as we find a different key in our snapshot.
670 // Consider keys: a 5, b 7 (del), b 5. When iterating, lastKey = a.
671 // Then we see b 7, which is deleted. If we don't store lastKey = b, we'll then return b 5,
672 // which is wrong. Therefore, update lastKey here.
Manish R Jainb3568eb2017-11-13 03:04:41673 it.lastKey = y.SafeCopy(it.lastKey, mi.Key())
Manish R Jain50a2e6d2017-09-28 00:33:28674 }
675
676FILL:
677 // If deleted, advance and return.
Janardhan Reddya0576272017-12-19 07:26:36678 vs := mi.Value()
679 if isDeletedOrExpired(vs.Meta, vs.ExpiresAt) {
Manish R Jain50a2e6d2017-09-28 00:33:28680 mi.Next()
681 return false
682 }
683
Manish R Jain0f9030a2017-05-02 03:40:47684 item := it.newItem()
685 it.fill(item)
Manish R Jain50a2e6d2017-09-28 00:33:28686 // fill item based on current cursor position. All Next calls have returned, so reaching here
687 // means no Next was called.
688
689 mi.Next() // Advance but no fill item yet.
690 if !it.opt.Reverse || !mi.Valid() { // Forward direction, or invalid.
Manish R Jain90be3872017-10-02 08:22:55691 setItem(item)
Manish R Jain50a2e6d2017-09-28 00:33:28692 return true
693 }
694
695 // Reverse direction.
696 nextTs := y.ParseTs(mi.Key())
Janardhan Reddyffaaa662017-10-02 06:47:46697 mik := y.ParseKey(mi.Key())
Hiroaki Nakamurac6414b12017-10-14 10:31:40698 if nextTs <= it.readTs && bytes.Equal(mik, item.key) {
Manish R Jain50a2e6d2017-09-28 00:33:28699 // This is a valid potential candidate.
700 goto FILL
701 }
702 // Ignore the next candidate. Return the current one.
Manish R Jain90be3872017-10-02 08:22:55703 setItem(item)
Manish R Jain50a2e6d2017-09-28 00:33:28704 return true
Manish R Jain0f9030a2017-05-02 03:40:47705}
Manish R Jain6911fde2017-04-16 06:20:16706
Manish R Jainabaad902017-10-04 10:55:56707func (it *Iterator) fill(item *Item) {
Manish R Jain0f9030a2017-05-02 03:40:47708 vs := it.iitr.Value()
709 item.meta = vs.Meta
Janardhan Reddy835ef242017-07-27 00:19:14710 item.userMeta = vs.UserMeta
Deepak Joisa5499e52017-11-02 02:03:14711 item.expiresAt = vs.ExpiresAt
Janardhan Reddyffaaa662017-10-02 06:47:46712
713 item.version = y.ParseTs(it.iitr.Key())
Manish R Jainb3568eb2017-11-13 03:04:41714 item.key = y.SafeCopy(item.key, y.ParseKey(it.iitr.Key()))
Janardhan Reddyffaaa662017-10-02 06:47:46715
Manish R Jainb3568eb2017-11-13 03:04:41716 item.vptr = y.SafeCopy(item.vptr, vs.Value)
Deepak Jois71abccc2017-08-23 10:26:26717 item.val = nil
Deepak Joisb9aae1b2017-08-31 05:06:49718 if it.opt.PrefetchValues {
Manish R Jain0f9030a2017-05-02 03:40:47719 item.wg.Add(1)
Manish R Jain55c350d2017-05-30 10:04:50720 go func() {
Deepak Joisb9aae1b2017-08-31 05:06:49721 // FIXME we are not handling errors here.
722 item.prefetchValue()
Manish R Jain55c350d2017-05-30 10:04:50723 item.wg.Done()
724 }()
Manish R Jain0f9030a2017-05-02 03:40:47725 }
726}
Manish R Jain6911fde2017-04-16 06:20:16727
Harshil Goelb77f2e82024-09-12 07:12:55728func hasPrefix(it *Iterator) bool {
729 // We shouldn't check prefix in case the iterator is going in reverse. Since in reverse we expect
730 // people to append items to the end of prefix.
731 if !it.opt.Reverse && len(it.opt.Prefix) > 0 {
732 return bytes.HasPrefix(y.ParseKey(it.iitr.Key()), it.opt.Prefix)
Harshil Goel2c148fe2024-08-14 12:35:42733 }
734 return true
735}
736
Manish R Jain0f9030a2017-05-02 03:40:47737func (it *Iterator) prefetch() {
Deepak Joisb9aae1b2017-08-31 05:06:49738 prefetchSize := 2
739 if it.opt.PrefetchValues && it.opt.PrefetchSize > 1 {
740 prefetchSize = it.opt.PrefetchSize
Pawan Rawal748aadf2017-08-18 07:22:24741 }
742
Manish R Jain0f9030a2017-05-02 03:40:47743 i := it.iitr
744 var count int
745 it.item = nil
Harshil Goelb77f2e82024-09-12 07:12:55746 for i.Valid() && hasPrefix(it) {
Manish R Jain50a2e6d2017-09-28 00:33:28747 if !it.parseItem() {
Szymon921c1532017-05-19 10:21:50748 continue
749 }
Manish R Jain0f9030a2017-05-02 03:40:47750 count++
Pawan Rawal748aadf2017-08-18 07:22:24751 if count == prefetchSize {
Manish R Jain0f9030a2017-05-02 03:40:47752 break
Manish R Jain6911fde2017-04-16 06:20:16753 }
754 }
755}
756
Martin Martinez Rivera3e0e35e2019-06-07 17:13:10757// Seek would seek to the provided key if present. If absent, it would seek to the next
758// smallest key greater than the provided key if iterating in the forward direction.
759// Behavior would be reversed if iterating backwards.
Manish R Jain0f9030a2017-05-02 03:40:47760func (it *Iterator) Seek(key []byte) {
Naman Jain195b2122021-02-22 15:59:19761 if it.iitr == nil {
762 return
763 }
Ibrahim Jarifda80eb92020-06-01 10:28:18764 if len(key) > 0 {
765 it.txn.addReadKey(key)
766 }
Pawan Rawalb50cd8f2017-05-20 10:32:58767 for i := it.data.pop(); i != nil; i = it.data.pop() {
Manish R Jain0f9030a2017-05-02 03:40:47768 i.wg.Wait()
769 it.waste.push(i)
Manish R Jain6911fde2017-04-16 06:20:16770 }
Janardhan Reddy9b31d1a2017-10-03 23:53:02771
772 it.lastKey = it.lastKey[:0]
773 if len(key) == 0 {
Manish R Jain7d460292018-11-25 19:53:03774 key = it.opt.Prefix
775 }
776 if len(key) == 0 {
Janardhan Reddy9b31d1a2017-10-03 23:53:02777 it.iitr.Rewind()
778 it.prefetch()
779 return
780 }
781
782 if !it.opt.Reverse {
783 key = y.KeyWithTs(key, it.txn.readTs)
784 } else {
785 key = y.KeyWithTs(key, 0)
786 }
Manish R Jain0f9030a2017-05-02 03:40:47787 it.iitr.Seek(key)
788 it.prefetch()
Manish R Jain6911fde2017-04-16 06:20:16789}
790
Manish R Jain83151362017-05-11 01:50:48791// Rewind would rewind the iterator cursor all the way to zero-th position, which would be the
792// smallest key if iterating forward, and largest if iterating backward. It does not keep track of
793// whether the cursor started with a Seek().
Manish R Jain0f9030a2017-05-02 03:40:47794func (it *Iterator) Rewind() {
Manish R Jain7d460292018-11-25 19:53:03795 it.Seek(nil)
Manish R Jain0f9030a2017-05-02 03:40:47796}