Manish R Jain | 640f107 | 2017-05-14 10:23:38 | [diff] [blame] | 1 | /* |
| 2 | * Copyright 2017 Dgraph Labs, Inc. and Contributors |
| 3 | * |
| 4 | * Licensed under the Apache License, Version 2.0 (the "License"); |
| 5 | * you may not use this file except in compliance with the License. |
| 6 | * You may obtain a copy of the License at |
| 7 | * |
| 8 | * http://www.apache.org/licenses/LICENSE-2.0 |
| 9 | * |
| 10 | * Unless required by applicable law or agreed to in writing, software |
| 11 | * distributed under the License is distributed on an "AS IS" BASIS, |
| 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 13 | * See the License for the specific language governing permissions and |
| 14 | * limitations under the License. |
| 15 | */ |
| 16 | |
Manish R Jain | 6911fde | 2017-04-16 06:20:16 | [diff] [blame] | 17 | package badger |
| 18 | |
| 19 | import ( |
Pawan Rawal | efb2a4a | 2017-05-29 05:00:45 | [diff] [blame] | 20 | "bytes" |
Janardhan Reddy | ffaaa66 | 2017-10-02 06:47:46 | [diff] [blame] | 21 | "fmt" |
Steven Allen | 5242a99 | 2018-10-04 21:40:52 | [diff] [blame] | 22 | "hash/crc32" |
Manish R Jain | 3d225d7 | 2020-11-26 02:19:09 | [diff] [blame] | 23 | "math" |
balaji | 2090032 | 2019-08-23 11:12:40 | [diff] [blame] | 24 | "sort" |
Manish R Jain | 6911fde | 2017-04-16 06:20:16 | [diff] [blame] | 25 | "sync" |
Deepak Jois | a5499e5 | 2017-11-02 02:03:14 | [diff] [blame] | 26 | "time" |
Manish R Jain | 6911fde | 2017-04-16 06:20:16 | [diff] [blame] | 27 | |
Joshua Goldstein | 31b7588 | 2023-02-27 22:21:42 | [diff] [blame] | 28 | "github.com/dgraph-io/badger/v4/table" |
| 29 | "github.com/dgraph-io/badger/v4/y" |
Aman Mangal | 7f657f8 | 2024-10-25 16:33:34 | [diff] [blame] | 30 | "github.com/dgraph-io/ristretto/v2/z" |
Manish R Jain | 6911fde | 2017-04-16 06:20:16 | [diff] [blame] | 31 | ) |
| 32 | |
// prefetchStatus records whether an Item's value has already been loaded by
// prefetchValue. The zero value means the value has not been prefetched.
type prefetchStatus uint8

const (
	// prefetched is stored in Item.status once prefetchValue has run.
	prefetched prefetchStatus = iota + 1
)
| 38 | |
// Item is returned during iteration. Both the Key() and Value() output is only valid until
// iterator.Next() is called.
type Item struct {
	key []byte
	// vptr holds either the value itself or, when meta has bitValuePointer set, an encoded
	// valuePointer.
	vptr []byte
	// val is the value cached by prefetchValue.
	val       []byte
	version   uint64
	expiresAt uint64

	slice *y.Slice // Used only during prefetching.
	// next links items together inside a list.
	next *Item
	txn  *Txn

	// err is the error recorded by prefetchValue.
	err error
	// wg is waited on by Value and ValueCopy before status is inspected.
	wg       sync.WaitGroup
	status   prefetchStatus
	meta     byte // We need to store meta to know about bitValuePointer.
	userMeta byte
}
| 58 | |
Lanre Adelowo | 8e23f14 | 2018-03-09 23:12:47 | [diff] [blame] | 59 | // String returns a string representation of Item |
| 60 | func (item *Item) String() string { |
| 61 | return fmt.Sprintf("key=%q, version=%d, meta=%x", item.Key(), item.Version(), item.meta) |
| 62 | } |
| 63 | |
// Key returns the key.
//
// Key is only valid as long as item is valid, or transaction is valid. If you need to use it
// outside its validity, please use KeyCopy.
//
// The returned slice is the item's own key field, not a copy.
func (item *Item) Key() []byte {
	return item.key
}
| 71 | |
// KeyCopy returns a copy of the key of the item, writing it to dst slice.
// If nil is passed, or capacity of dst isn't sufficient, a new slice would be allocated and
// returned.
//
// The copy is produced by y.SafeCopy.
func (item *Item) KeyCopy(dst []byte) []byte {
	return y.SafeCopy(dst, item.key)
}
| 78 | |
// Version returns the commit timestamp of the item, as stored in its version field.
func (item *Item) Version() uint64 {
	return item.version
}
| 83 | |
Manish R Jain | 1d625f4 | 2017-10-05 00:47:56 | [diff] [blame] | 84 | // Value retrieves the value of the item from the value log. |
Deepak Jois | cb5a769 | 2017-09-07 09:39:18 | [diff] [blame] | 85 | // |
Deepak Jois | a6c4a1c | 2017-12-19 07:45:25 | [diff] [blame] | 86 | // This method must be called within a transaction. Calling it outside a |
| 87 | // transaction is considered undefined behavior. If an iterator is being used, |
| 88 | // then Item.Value() is defined in the current iteration only, because items are |
| 89 | // reused. |
| 90 | // |
| 91 | // If you need to use a value outside a transaction, please use Item.ValueCopy |
Janardhan Reddy | 269b9f7 | 2018-02-26 11:07:14 | [diff] [blame] | 92 | // instead, or copy it yourself. Value might change once discard or commit is called. |
| 93 | // Use ValueCopy if you want to do a Set after Get. |
Manish R Jain | 439fd46 | 2018-10-04 18:15:05 | [diff] [blame] | 94 | func (item *Item) Value(fn func(val []byte) error) error { |
Manish R Jain | 0f9030a | 2017-05-02 03:40:47 | [diff] [blame] | 95 | item.wg.Wait() |
Deepak Jois | b9aae1b | 2017-08-31 05:06:49 | [diff] [blame] | 96 | if item.status == prefetched { |
Manish R Jain | c10276c | 2018-09-25 04:13:48 | [diff] [blame] | 97 | if item.err == nil && fn != nil { |
denkhaus | 09d0664 | 2019-01-04 22:48:45 | [diff] [blame] | 98 | if err := fn(item.val); err != nil { |
| 99 | return err |
| 100 | } |
Manish R Jain | c10276c | 2018-09-25 04:13:48 | [diff] [blame] | 101 | } |
| 102 | return item.err |
Deepak Jois | b9aae1b | 2017-08-31 05:06:49 | [diff] [blame] | 103 | } |
Manish R Jain | 83aa09d | 2017-10-04 07:20:27 | [diff] [blame] | 104 | buf, cb, err := item.yieldItemValue() |
Manish R Jain | c10276c | 2018-09-25 04:13:48 | [diff] [blame] | 105 | defer runCallback(cb) |
Manish R Jain | 439fd46 | 2018-10-04 18:15:05 | [diff] [blame] | 106 | if err != nil { |
| 107 | return err |
Manish R Jain | 83aa09d | 2017-10-04 07:20:27 | [diff] [blame] | 108 | } |
Manish R Jain | 439fd46 | 2018-10-04 18:15:05 | [diff] [blame] | 109 | if fn != nil { |
| 110 | return fn(buf) |
| 111 | } |
| 112 | return nil |
Manish R Jain | 6911fde | 2017-04-16 06:20:16 | [diff] [blame] | 113 | } |
| 114 | |
Manish R Jain | b3568eb | 2017-11-13 03:04:41 | [diff] [blame] | 115 | // ValueCopy returns a copy of the value of the item from the value log, writing it to dst slice. |
| 116 | // If nil is passed, or capacity of dst isn't sufficient, a new slice would be allocated and |
| 117 | // returned. Tip: It might make sense to reuse the returned slice as dst argument for the next call. |
| 118 | // |
| 119 | // This function is useful in long running iterate/update transactions to avoid a write deadlock. |
| 120 | // See Github issue: https://github.com/dgraph-io/badger/issues/315 |
| 121 | func (item *Item) ValueCopy(dst []byte) ([]byte, error) { |
| 122 | item.wg.Wait() |
| 123 | if item.status == prefetched { |
| 124 | return y.SafeCopy(dst, item.val), item.err |
| 125 | } |
| 126 | buf, cb, err := item.yieldItemValue() |
| 127 | defer runCallback(cb) |
| 128 | return y.SafeCopy(dst, buf), err |
| 129 | } |
| 130 | |
Manish R Jain | abaad90 | 2017-10-04 10:55:56 | [diff] [blame] | 131 | func (item *Item) hasValue() bool { |
Manish R Jain | d979967 | 2017-09-01 04:19:21 | [diff] [blame] | 132 | if item.meta == 0 && item.vptr == nil { |
| 133 | // key not found |
| 134 | return false |
| 135 | } |
Manish R Jain | d979967 | 2017-09-01 04:19:21 | [diff] [blame] | 136 | return true |
| 137 | } |
| 138 | |
// IsDeletedOrExpired returns true if item contains deleted or expired value.
// The decision is delegated to isDeletedOrExpired using the item's meta and expiresAt.
func (item *Item) IsDeletedOrExpired() bool {
	return isDeletedOrExpired(item.meta, item.expiresAt)
}
| 143 | |
Ibrahim Jarif | 09e9b63 | 2019-04-15 10:43:20 | [diff] [blame] | 144 | // DiscardEarlierVersions returns whether the item was created with the |
Martin Martinez Rivera | b85f5ae | 2019-01-09 01:50:02 | [diff] [blame] | 145 | // option to discard earlier versions of a key when multiple are available. |
Manish R Jain | 79c98fc | 2018-05-05 01:56:43 | [diff] [blame] | 146 | func (item *Item) DiscardEarlierVersions() bool { |
| 147 | return item.meta&bitDiscardEarlierVersions > 0 |
| 148 | } |
| 149 | |
Manish R Jain | abaad90 | 2017-10-04 10:55:56 | [diff] [blame] | 150 | func (item *Item) yieldItemValue() ([]byte, func(), error) { |
Manish R Jain | e201d7b | 2018-06-04 18:50:41 | [diff] [blame] | 151 | key := item.Key() // No need to copy. |
Ibrahim Jarif | 0a5046f | 2020-09-28 16:31:33 | [diff] [blame] | 152 | if !item.hasValue() { |
| 153 | return nil, nil, nil |
Manish R Jain | 7af0076 | 2018-05-08 23:42:00 | [diff] [blame] | 154 | } |
Ibrahim Jarif | 0a5046f | 2020-09-28 16:31:33 | [diff] [blame] | 155 | |
| 156 | if item.slice == nil { |
| 157 | item.slice = new(y.Slice) |
| 158 | } |
| 159 | |
| 160 | if (item.meta & bitValuePointer) == 0 { |
| 161 | val := item.slice.Resize(len(item.vptr)) |
| 162 | copy(val, item.vptr) |
| 163 | return val, nil, nil |
| 164 | } |
| 165 | |
| 166 | var vp valuePointer |
| 167 | vp.Decode(item.vptr) |
| 168 | db := item.txn.db |
| 169 | result, cb, err := db.vlog.Read(vp, item.slice) |
| 170 | if err != nil { |
Paul Chesnais | da1dcac | 2023-07-17 22:41:18 | [diff] [blame] | 171 | db.opt.Errorf("Unable to read: Key: %v, Version : %v, meta: %v, userMeta: %v"+ |
Manish R Jain | e3a0d29 | 2020-10-07 01:41:41 | [diff] [blame] | 172 | " Error: %v", key, item.version, item.meta, item.userMeta, err) |
Manish R Jain | 3d225d7 | 2020-11-26 02:19:09 | [diff] [blame] | 173 | var txn *Txn |
| 174 | if db.opt.managedTxns { |
| 175 | txn = db.NewTransactionAt(math.MaxUint64, false) |
| 176 | } else { |
| 177 | txn = db.NewTransaction(false) |
| 178 | } |
Ibrahim Jarif | feb1f5f | 2020-11-25 13:15:52 | [diff] [blame] | 179 | defer txn.Discard() |
| 180 | |
| 181 | iopt := DefaultIteratorOptions |
| 182 | iopt.AllVersions = true |
| 183 | iopt.InternalAccess = true |
| 184 | iopt.PrefetchValues = false |
| 185 | |
| 186 | it := txn.NewKeyIterator(item.Key(), iopt) |
| 187 | defer it.Close() |
| 188 | for it.Rewind(); it.Valid(); it.Next() { |
| 189 | item := it.Item() |
| 190 | var vp valuePointer |
| 191 | if item.meta&bitValuePointer > 0 { |
| 192 | vp.Decode(item.vptr) |
| 193 | } |
Paul Chesnais | da1dcac | 2023-07-17 22:41:18 | [diff] [blame] | 194 | db.opt.Errorf("Key: %v, Version : %v, meta: %v, userMeta: %v valuePointer: %+v", |
Ibrahim Jarif | feb1f5f | 2020-11-25 13:15:52 | [diff] [blame] | 195 | item.Key(), item.version, item.meta, item.userMeta, vp) |
| 196 | } |
Ibrahim Jarif | 0a5046f | 2020-09-28 16:31:33 | [diff] [blame] | 197 | } |
Ibrahim Jarif | feb1f5f | 2020-11-25 13:15:52 | [diff] [blame] | 198 | // Don't return error if we cannot read the value. Just log the error. |
| 199 | return result, cb, nil |
Manish R Jain | 83aa09d | 2017-10-04 07:20:27 | [diff] [blame] | 200 | } |
| 201 | |
// runCallback invokes cb unless it is nil.
func runCallback(cb func()) {
	if cb == nil {
		return
	}
	cb()
}
| 207 | |
Manish R Jain | abaad90 | 2017-10-04 10:55:56 | [diff] [blame] | 208 | func (item *Item) prefetchValue() { |
Manish R Jain | 83aa09d | 2017-10-04 07:20:27 | [diff] [blame] | 209 | val, cb, err := item.yieldItemValue() |
| 210 | defer runCallback(cb) |
| 211 | |
| 212 | item.err = err |
| 213 | item.status = prefetched |
| 214 | if val == nil { |
| 215 | return |
| 216 | } |
Manish R Jain | e3a0d29 | 2020-10-07 01:41:41 | [diff] [blame] | 217 | buf := item.slice.Resize(len(val)) |
| 218 | copy(buf, val) |
| 219 | item.val = buf |
Deepak Jois | b9aae1b | 2017-08-31 05:06:49 | [diff] [blame] | 220 | } |
| 221 | |
Bertram Truong | fc94c57 | 2018-10-08 17:57:38 | [diff] [blame] | 222 | // EstimatedSize returns the approximate size of the key-value pair. |
Deepak Jois | b9aae1b | 2017-08-31 05:06:49 | [diff] [blame] | 223 | // |
| 224 | // This can be called while iterating through a store to quickly estimate the |
| 225 | // size of a range of key-value pairs (without fetching the corresponding |
| 226 | // values). |
Manish R Jain | abaad90 | 2017-10-04 10:55:56 | [diff] [blame] | 227 | func (item *Item) EstimatedSize() int64 { |
Manish R Jain | d979967 | 2017-09-01 04:19:21 | [diff] [blame] | 228 | if !item.hasValue() { |
| 229 | return 0 |
| 230 | } |
Manish R Jain | 1d625f4 | 2017-10-05 00:47:56 | [diff] [blame] | 231 | if (item.meta & bitValuePointer) == 0 { |
Manish R Jain | d979967 | 2017-09-01 04:19:21 | [diff] [blame] | 232 | return int64(len(item.key) + len(item.vptr)) |
| 233 | } |
| 234 | var vp valuePointer |
| 235 | vp.Decode(item.vptr) |
| 236 | return int64(vp.Len) // includes key length. |
| 237 | } |
| 238 | |
// KeySize returns the size of the key.
// Exact size of the key is key + 8 bytes of timestamp; the value returned here is
// len(key) only and does not include those 8 bytes.
func (item *Item) KeySize() int64 {
	return int64(len(item.key))
}
| 244 | |
Ibrahim Jarif | d8e1fcf | 2019-07-19 05:07:03 | [diff] [blame] | 245 | // ValueSize returns the approximate size of the value. |
Steven Allen | 5242a99 | 2018-10-04 21:40:52 | [diff] [blame] | 246 | // |
| 247 | // This can be called to quickly estimate the size of a value without fetching |
| 248 | // it. |
| 249 | func (item *Item) ValueSize() int64 { |
| 250 | if !item.hasValue() { |
| 251 | return 0 |
| 252 | } |
| 253 | if (item.meta & bitValuePointer) == 0 { |
| 254 | return int64(len(item.vptr)) |
| 255 | } |
| 256 | var vp valuePointer |
| 257 | vp.Decode(item.vptr) |
Manish R Jain | 5199564 | 2018-10-04 21:50:35 | [diff] [blame] | 258 | |
| 259 | klen := int64(len(item.key) + 8) // 8 bytes for timestamp. |
Ibrahim Jarif | d8e1fcf | 2019-07-19 05:07:03 | [diff] [blame] | 260 | // 6 bytes are for the approximate length of the header. Since header is encoded in varint, we |
| 261 | // cannot find the exact length of header without fetching it. |
| 262 | return int64(vp.Len) - klen - 6 - crc32.Size |
Steven Allen | 5242a99 | 2018-10-04 21:40:52 | [diff] [blame] | 263 | } |
| 264 | |
// UserMeta returns the userMeta set by the user. Typically, this byte, optionally set by the user
// is used to interpret the value.
func (item *Item) UserMeta() byte {
	return item.userMeta
}
| 270 | |
// ExpiresAt returns a Unix time value indicating when the item will be
// considered expired. 0 indicates that the item will never expire.
func (item *Item) ExpiresAt() uint64 {
	return item.expiresAt
}
| 276 | |
// list is a singly linked queue of Items chained through Item.next. head is the next item
// to be popped and tail is the most recently pushed one; both are nil when the list is
// empty.
//
// TODO: Switch this to use linked list container in Go.
type list struct {
	head *Item
	tail *Item
}
| 282 | |
Manish R Jain | abaad90 | 2017-10-04 10:55:56 | [diff] [blame] | 283 | func (l *list) push(i *Item) { |
Manish R Jain | 0f9030a | 2017-05-02 03:40:47 | [diff] [blame] | 284 | i.next = nil |
| 285 | if l.tail == nil { |
| 286 | l.head = i |
| 287 | l.tail = i |
Manish R Jain | 6911fde | 2017-04-16 06:20:16 | [diff] [blame] | 288 | return |
| 289 | } |
Manish R Jain | 0f9030a | 2017-05-02 03:40:47 | [diff] [blame] | 290 | l.tail.next = i |
| 291 | l.tail = i |
| 292 | } |
| 293 | |
Manish R Jain | abaad90 | 2017-10-04 10:55:56 | [diff] [blame] | 294 | func (l *list) pop() *Item { |
Manish R Jain | 0f9030a | 2017-05-02 03:40:47 | [diff] [blame] | 295 | if l.head == nil { |
| 296 | return nil |
Manish R Jain | 6911fde | 2017-04-16 06:20:16 | [diff] [blame] | 297 | } |
Manish R Jain | 0f9030a | 2017-05-02 03:40:47 | [diff] [blame] | 298 | i := l.head |
| 299 | if l.head == l.tail { |
| 300 | l.tail = nil |
| 301 | l.head = nil |
| 302 | } else { |
| 303 | l.head = i.next |
| 304 | } |
| 305 | i.next = nil |
| 306 | return i |
| 307 | } |
| 308 | |
// IteratorOptions is used to set options when iterating over Badger key-value
// stores.
//
// This package provides DefaultIteratorOptions which contains options that
// should work for most applications. Consider using that as a starting point
// before customizing it for your own needs.
type IteratorOptions struct {
	// PrefetchSize is the number of KV pairs to prefetch while iterating.
	// Valid only if PrefetchValues is true.
	PrefetchSize int
	// PrefetchValues indicates whether we should prefetch values during
	// iteration and store them.
	PrefetchValues bool
	Reverse        bool // Direction of iteration. False is forward, true is backward.
	AllVersions    bool // Fetch all valid versions of the same key.
	InternalAccess bool // Used to allow internal access to badger keys.

	// The following option is used to narrow down the SSTables that iterator
	// picks up. If Prefix is specified, only tables which could have this
	// prefix are picked based on their range of keys.
	prefixIsKey bool   // If set, use the prefix for bloom filter lookup.
	Prefix      []byte // Only iterate over this given prefix.
	SinceTs     uint64 // Only read data that has version > SinceTs.
}
| 333 | |
balaji | 2090032 | 2019-08-23 11:12:40 | [diff] [blame] | 334 | func (opt *IteratorOptions) compareToPrefix(key []byte) int { |
| 335 | // We should compare key without timestamp. For example key - a[TS] might be > "aa" prefix. |
| 336 | key = y.ParseKey(key) |
| 337 | if len(key) > len(opt.Prefix) { |
| 338 | key = key[:len(opt.Prefix)] |
| 339 | } |
| 340 | return bytes.Compare(key, opt.Prefix) |
| 341 | } |
| 342 | |
Martin Martinez Rivera | b85f5ae | 2019-01-09 01:50:02 | [diff] [blame] | 343 | func (opt *IteratorOptions) pickTable(t table.TableInterface) bool { |
Ibrahim Jarif | 31c061e | 2021-02-05 12:49:34 | [diff] [blame] | 344 | // Ignore this table if its max version is less than the sinceTs. |
| 345 | if t.MaxVersion() < opt.SinceTs { |
| 346 | return false |
| 347 | } |
Manish R Jain | 7d46029 | 2018-11-25 19:53:03 | [diff] [blame] | 348 | if len(opt.Prefix) == 0 { |
| 349 | return true |
| 350 | } |
balaji | 2090032 | 2019-08-23 11:12:40 | [diff] [blame] | 351 | if opt.compareToPrefix(t.Smallest()) > 0 { |
Manish R Jain | 49a49e3 | 2018-11-26 21:07:12 | [diff] [blame] | 352 | return false |
| 353 | } |
balaji | 2090032 | 2019-08-23 11:12:40 | [diff] [blame] | 354 | if opt.compareToPrefix(t.Biggest()) < 0 { |
Manish R Jain | 49a49e3 | 2018-11-26 21:07:12 | [diff] [blame] | 355 | return false |
| 356 | } |
| 357 | // Bloom filter lookup would only work if opt.Prefix does NOT have the read |
| 358 | // timestamp as part of the key. |
Ibrahim Jarif | 599363b | 2020-10-03 16:34:58 | [diff] [blame] | 359 | if opt.prefixIsKey && t.DoesNotHave(y.Hash(opt.Prefix)) { |
Manish R Jain | 49a49e3 | 2018-11-26 21:07:12 | [diff] [blame] | 360 | return false |
| 361 | } |
| 362 | return true |
Manish R Jain | 7d46029 | 2018-11-25 19:53:03 | [diff] [blame] | 363 | } |
| 364 | |
balaji | 2090032 | 2019-08-23 11:12:40 | [diff] [blame] | 365 | // pickTables picks the necessary table for the iterator. This function also assumes |
| 366 | // that the tables are sorted in the right order. |
| 367 | func (opt *IteratorOptions) pickTables(all []*table.Table) []*table.Table { |
Ibrahim Jarif | 31c061e | 2021-02-05 12:49:34 | [diff] [blame] | 368 | filterTables := func(tables []*table.Table) []*table.Table { |
Aman Mangal | ffd74f3 | 2023-02-14 15:51:41 | [diff] [blame^] | 369 | if opt.SinceTs == 0 { |
| 370 | return tables |
Ibrahim Jarif | 31c061e | 2021-02-05 12:49:34 | [diff] [blame] | 371 | } |
Aman Mangal | ffd74f3 | 2023-02-14 15:51:41 | [diff] [blame^] | 372 | out := tables[:0] |
| 373 | for _, t := range tables { |
| 374 | if t.MaxVersion() < opt.SinceTs { |
| 375 | continue |
| 376 | } |
| 377 | out = append(out, t) |
| 378 | } |
| 379 | return out |
Ibrahim Jarif | 31c061e | 2021-02-05 12:49:34 | [diff] [blame] | 380 | } |
| 381 | |
balaji | 2090032 | 2019-08-23 11:12:40 | [diff] [blame] | 382 | if len(opt.Prefix) == 0 { |
| 383 | out := make([]*table.Table, len(all)) |
| 384 | copy(out, all) |
Ibrahim Jarif | 31c061e | 2021-02-05 12:49:34 | [diff] [blame] | 385 | return filterTables(out) |
balaji | 2090032 | 2019-08-23 11:12:40 | [diff] [blame] | 386 | } |
| 387 | sIdx := sort.Search(len(all), func(i int) bool { |
Manish R Jain | 74f2e02 | 2020-12-03 23:30:11 | [diff] [blame] | 388 | // table.Biggest >= opt.prefix |
| 389 | // if opt.Prefix < table.Biggest, then surely it is not in any of the preceding tables. |
balaji | 2090032 | 2019-08-23 11:12:40 | [diff] [blame] | 390 | return opt.compareToPrefix(all[i].Biggest()) >= 0 |
| 391 | }) |
| 392 | if sIdx == len(all) { |
| 393 | // Not found. |
| 394 | return []*table.Table{} |
| 395 | } |
| 396 | |
| 397 | filtered := all[sIdx:] |
| 398 | if !opt.prefixIsKey { |
| 399 | eIdx := sort.Search(len(filtered), func(i int) bool { |
| 400 | return opt.compareToPrefix(filtered[i].Smallest()) > 0 |
| 401 | }) |
| 402 | out := make([]*table.Table, len(filtered[:eIdx])) |
| 403 | copy(out, filtered[:eIdx]) |
Ibrahim Jarif | 31c061e | 2021-02-05 12:49:34 | [diff] [blame] | 404 | return filterTables(out) |
balaji | 2090032 | 2019-08-23 11:12:40 | [diff] [blame] | 405 | } |
| 406 | |
Manish R Jain | 74f2e02 | 2020-12-03 23:30:11 | [diff] [blame] | 407 | // opt.prefixIsKey == true. This code is optimizing for opt.prefixIsKey part. |
balaji | 2090032 | 2019-08-23 11:12:40 | [diff] [blame] | 408 | var out []*table.Table |
Manish R Jain | 74f2e02 | 2020-12-03 23:30:11 | [diff] [blame] | 409 | hash := y.Hash(opt.Prefix) |
balaji | 2090032 | 2019-08-23 11:12:40 | [diff] [blame] | 410 | for _, t := range filtered { |
Manish R Jain | 74f2e02 | 2020-12-03 23:30:11 | [diff] [blame] | 411 | // When we encounter the first table whose smallest key is higher than opt.Prefix, we can |
| 412 | // stop. This is an IMPORTANT optimization, just considering how often we call |
| 413 | // NewKeyIterator. |
| 414 | if opt.compareToPrefix(t.Smallest()) > 0 { |
| 415 | // if table.Smallest > opt.Prefix, then this and all tables after this can be ignored. |
| 416 | break |
balaji | 2090032 | 2019-08-23 11:12:40 | [diff] [blame] | 417 | } |
Manish R Jain | 74f2e02 | 2020-12-03 23:30:11 | [diff] [blame] | 418 | // opt.Prefix is actually the key. So, we can run bloom filter checks |
| 419 | // as well. |
| 420 | if t.DoesNotHave(hash) { |
| 421 | continue |
| 422 | } |
| 423 | out = append(out, t) |
balaji | 2090032 | 2019-08-23 11:12:40 | [diff] [blame] | 424 | } |
Ibrahim Jarif | 31c061e | 2021-02-05 12:49:34 | [diff] [blame] | 425 | return filterTables(out) |
balaji | 2090032 | 2019-08-23 11:12:40 | [diff] [blame] | 426 | } |
| 427 | |
// DefaultIteratorOptions contains default options when iterating over Badger key-value stores.
// It enables value prefetching with a PrefetchSize of 100 and iterates forward with
// AllVersions turned off.
var DefaultIteratorOptions = IteratorOptions{
	PrefetchValues: true,
	PrefetchSize:   100,
	Reverse:        false,
	AllVersions:    false,
}
| 435 | |
// Iterator helps iterating over the KV pairs in a lexicographically sorted order.
type Iterator struct {
	// iitr is the merge iterator assembled in NewIterator.
	iitr   y.Iterator
	txn    *Txn
	readTs uint64

	opt  IteratorOptions
	item *Item
	data list
	// waste holds items available for reuse; newItem pops from it before allocating.
	waste list

	lastKey []byte // Used to skip over multiple versions of the same key.

	closed  bool
	scanned int // Used to estimate the size of data scanned by iterator.

	// ThreadId is an optional value that can be set to identify which goroutine created
	// the iterator. It can be used, for example, to uniquely identify each of the
	// iterators created by the stream interface
	ThreadId int

	Alloc *z.Allocator
}
| 459 | |
Manish R Jain | 1d625f4 | 2017-10-05 00:47:56 | [diff] [blame] | 460 | // NewIterator returns a new iterator. Depending upon the options, either only keys, or both |
| 461 | // key-value pairs would be fetched. The keys are returned in lexicographically sorted order. |
Manish R Jain | 41d9656 | 2018-09-25 19:40:45 | [diff] [blame] | 462 | // Using prefetch is recommended if you're doing a long running iteration, for performance. |
| 463 | // |
| 464 | // Multiple Iterators: |
Aman Mangal | b1ea360 | 2023-02-22 17:36:03 | [diff] [blame] | 465 | // For a read-only txn, multiple iterators can be running simultaneously. However, for a read-write |
Elliot Courant | af22dfd | 2020-05-13 06:07:33 | [diff] [blame] | 466 | // txn, iterators have the nuance of being a snapshot of the writes for the transaction at the time |
| 467 | // iterator was created. If writes are performed after an iterator is created, then that iterator |
| 468 | // will not be able to see those writes. Only writes performed before an iterator was created can be |
| 469 | // viewed. |
Manish R Jain | 1d625f4 | 2017-10-05 00:47:56 | [diff] [blame] | 470 | func (txn *Txn) NewIterator(opt IteratorOptions) *Iterator { |
Manish R Jain | 41d9656 | 2018-09-25 19:40:45 | [diff] [blame] | 471 | if txn.discarded { |
Aman Mangal | b1ea360 | 2023-02-22 17:36:03 | [diff] [blame] | 472 | panic(ErrDiscardedTxn) |
Manish R Jain | 41d9656 | 2018-09-25 19:40:45 | [diff] [blame] | 473 | } |
Ibrahim Jarif | 1e21a94 | 2020-08-26 10:03:36 | [diff] [blame] | 474 | if txn.db.IsClosed() { |
Aman Mangal | b1ea360 | 2023-02-22 17:36:03 | [diff] [blame] | 475 | panic(ErrDBClosed) |
Ibrahim Jarif | 1e21a94 | 2020-08-26 10:03:36 | [diff] [blame] | 476 | } |
Elliot Courant | af22dfd | 2020-05-13 06:07:33 | [diff] [blame] | 477 | |
Harshil Goel | ec80d3d | 2023-07-18 05:54:55 | [diff] [blame] | 478 | y.NumIteratorsCreatedAdd(txn.db.opt.MetricsEnabled, 1) |
| 479 | |
Elliot Courant | af22dfd | 2020-05-13 06:07:33 | [diff] [blame] | 480 | // Keep track of the number of active iterators. |
Aman Mangal | b1ea360 | 2023-02-22 17:36:03 | [diff] [blame] | 481 | txn.numIterators.Add(1) |
Manish R Jain | b1ad1e9 | 2018-06-19 00:07:00 | [diff] [blame] | 482 | |
Aman Mangal | b1ea360 | 2023-02-22 17:36:03 | [diff] [blame] | 483 | // TODO: If Prefix is set, only pick those memtables which have keys with the prefix. |
Manish R Jain | 1d625f4 | 2017-10-05 00:47:56 | [diff] [blame] | 484 | tables, decr := txn.db.getMemTables() |
| 485 | defer decr() |
| 486 | txn.db.vlog.incrIteratorCount() |
| 487 | var iters []y.Iterator |
Janardhan Reddy | 097bd7a | 2017-11-28 01:20:36 | [diff] [blame] | 488 | if itr := txn.newPendingWritesIterator(opt.Reverse); itr != nil { |
| 489 | iters = append(iters, itr) |
| 490 | } |
Manish R Jain | 1d625f4 | 2017-10-05 00:47:56 | [diff] [blame] | 491 | for i := 0; i < len(tables); i++ { |
Manish R Jain | e3a0d29 | 2020-10-07 01:41:41 | [diff] [blame] | 492 | iters = append(iters, tables[i].sl.NewUniIterator(opt.Reverse)) |
Manish R Jain | 1d625f4 | 2017-10-05 00:47:56 | [diff] [blame] | 493 | } |
Aman Mangal | ffd74f3 | 2023-02-14 15:51:41 | [diff] [blame^] | 494 | iters = append(iters, txn.db.lc.iterators(&opt)...) // This will increment references. |
Manish R Jain | 1d625f4 | 2017-10-05 00:47:56 | [diff] [blame] | 495 | res := &Iterator{ |
| 496 | txn: txn, |
Ibrahim Jarif | 73ea6e6 | 2019-10-21 10:33:21 | [diff] [blame] | 497 | iitr: table.NewMergeIterator(iters, opt.Reverse), |
Manish R Jain | 1d625f4 | 2017-10-05 00:47:56 | [diff] [blame] | 498 | opt: opt, |
| 499 | readTs: txn.readTs, |
| 500 | } |
| 501 | return res |
| 502 | } |
| 503 | |
Manish R Jain | 49a49e3 | 2018-11-26 21:07:12 | [diff] [blame] | 504 | // NewKeyIterator is just like NewIterator, but allows the user to iterate over all versions of a |
| 505 | // single key. Internally, it sets the Prefix option in provided opt, and uses that prefix to |
| 506 | // additionally run bloom filter lookups before picking tables from the LSM tree. |
| 507 | func (txn *Txn) NewKeyIterator(key []byte, opt IteratorOptions) *Iterator { |
| 508 | if len(opt.Prefix) > 0 { |
| 509 | panic("opt.Prefix should be nil for NewKeyIterator.") |
| 510 | } |
| 511 | opt.Prefix = key // This key must be without the timestamp. |
| 512 | opt.prefixIsKey = true |
Ibrahim Jarif | a11c5e0 | 2019-08-01 13:53:46 | [diff] [blame] | 513 | opt.AllVersions = true |
Manish R Jain | 49a49e3 | 2018-11-26 21:07:12 | [diff] [blame] | 514 | return txn.NewIterator(opt) |
| 515 | } |
| 516 | |
Manish R Jain | abaad90 | 2017-10-04 10:55:56 | [diff] [blame] | 517 | func (it *Iterator) newItem() *Item { |
Manish R Jain | 0f9030a | 2017-05-02 03:40:47 | [diff] [blame] | 518 | item := it.waste.pop() |
| 519 | if item == nil { |
Ibrahim Jarif | cdf09c0 | 2020-09-20 17:19:40 | [diff] [blame] | 520 | item = &Item{slice: new(y.Slice), txn: it.txn} |
Manish R Jain | 4ab1548 | 2017-04-28 09:38:01 | [diff] [blame] | 521 | } |
Manish R Jain | 4ab1548 | 2017-04-28 09:38:01 | [diff] [blame] | 522 | return item |
Manish R Jain | 6911fde | 2017-04-16 06:20:16 | [diff] [blame] | 523 | } |
| 524 | |
Manish R Jain | 1d625f4 | 2017-10-05 00:47:56 | [diff] [blame] | 525 | // Item returns pointer to the current key-value pair. |
Manish R Jain | 8315136 | 2017-05-11 01:50:48 | [diff] [blame] | 526 | // This item is only valid until it.Next() gets called. |
Manish R Jain | abaad90 | 2017-10-04 10:55:56 | [diff] [blame] | 527 | func (it *Iterator) Item() *Item { |
Manish R Jain | 50a2e6d | 2017-09-28 00:33:28 | [diff] [blame] | 528 | tx := it.txn |
Manish R Jain | 41d9656 | 2018-09-25 19:40:45 | [diff] [blame] | 529 | tx.addReadKey(it.item.Key()) |
Manish R Jain | 50a2e6d | 2017-09-28 00:33:28 | [diff] [blame] | 530 | return it.item |
| 531 | } |
Manish R Jain | 0f9030a | 2017-05-02 03:40:47 | [diff] [blame] | 532 | |
Manish R Jain | 8315136 | 2017-05-11 01:50:48 | [diff] [blame] | 533 | // Valid returns false when iteration is done. |
Manish R Jain | 7d46029 | 2018-11-25 19:53:03 | [diff] [blame] | 534 | func (it *Iterator) Valid() bool { |
| 535 | if it.item == nil { |
| 536 | return false |
| 537 | } |
Ibrahim Jarif | a11c5e0 | 2019-08-01 13:53:46 | [diff] [blame] | 538 | if it.opt.prefixIsKey { |
| 539 | return bytes.Equal(it.item.key, it.opt.Prefix) |
| 540 | } |
Manish R Jain | 7d46029 | 2018-11-25 19:53:03 | [diff] [blame] | 541 | return bytes.HasPrefix(it.item.key, it.opt.Prefix) |
| 542 | } |
Manish R Jain | 0f9030a | 2017-05-02 03:40:47 | [diff] [blame] | 543 | |
Salim Alami | eaad7eb | 2017-06-28 01:16:34 | [diff] [blame] | 544 | // ValidForPrefix returns false when iteration is done |
| 545 | // or when the current key is not prefixed by the specified prefix. |
| 546 | func (it *Iterator) ValidForPrefix(prefix []byte) bool { |
Manish R Jain | 7d46029 | 2018-11-25 19:53:03 | [diff] [blame] | 547 | return it.Valid() && bytes.HasPrefix(it.item.key, prefix) |
Salim Alami | eaad7eb | 2017-06-28 01:16:34 | [diff] [blame] | 548 | } |
| 549 | |
Manish R Jain | 8315136 | 2017-05-11 01:50:48 | [diff] [blame] | 550 | // Close would close the iterator. It is important to call this when you're done with iteration. |
Manish R Jain | 41b1cd2 | 2017-05-03 01:12:24 | [diff] [blame] | 551 | func (it *Iterator) Close() { |
Manish R Jain | 41d9656 | 2018-09-25 19:40:45 | [diff] [blame] | 552 | if it.closed { |
| 553 | return |
| 554 | } |
| 555 | it.closed = true |
Naman Jain | 195b212 | 2021-02-22 15:59:19 | [diff] [blame] | 556 | if it.iitr == nil { |
Aman Mangal | b1ea360 | 2023-02-22 17:36:03 | [diff] [blame] | 557 | it.txn.numIterators.Add(-1) |
Naman Jain | 195b212 | 2021-02-22 15:59:19 | [diff] [blame] | 558 | return |
| 559 | } |
Manish R Jain | 8d041dd | 2018-02-27 00:19:34 | [diff] [blame] | 560 | |
Manish R Jain | 41d9656 | 2018-09-25 19:40:45 | [diff] [blame] | 561 | it.iitr.Close() |
Manish R Jain | 8d041dd | 2018-02-27 00:19:34 | [diff] [blame] | 562 | // It is important to wait for the fill goroutines to finish. Otherwise, we might leave zombie |
| 563 | // goroutines behind, which are waiting to acquire file read locks after DB has been closed. |
| 564 | waitFor := func(l list) { |
| 565 | item := l.pop() |
| 566 | for item != nil { |
| 567 | item.wg.Wait() |
| 568 | item = l.pop() |
| 569 | } |
| 570 | } |
| 571 | waitFor(it.waste) |
| 572 | waitFor(it.data) |
| 573 | |
Sam Hughes | 9ed12b9 | 2017-09-14 05:15:36 | [diff] [blame] | 574 | // TODO: We could handle this error. |
Manish R Jain | abaad90 | 2017-10-04 10:55:56 | [diff] [blame] | 575 | _ = it.txn.db.vlog.decrIteratorCount() |
Aman Mangal | b1ea360 | 2023-02-22 17:36:03 | [diff] [blame] | 576 | it.txn.numIterators.Add(-1) |
Manish R Jain | 41b1cd2 | 2017-05-03 01:12:24 | [diff] [blame] | 577 | } |
| 578 | |
Manish R Jain | 8315136 | 2017-05-11 01:50:48 | [diff] [blame] | 579 | // Next would advance the iterator by one. Always check it.Valid() after a Next() |
| 580 | // to ensure you have access to a valid it.Item(). |
Manish R Jain | 0f9030a | 2017-05-02 03:40:47 | [diff] [blame] | 581 | func (it *Iterator) Next() { |
Naman Jain | 195b212 | 2021-02-22 15:59:19 | [diff] [blame] | 582 | if it.iitr == nil { |
| 583 | return |
| 584 | } |
Manish R Jain | 0f9030a | 2017-05-02 03:40:47 | [diff] [blame] | 585 | // Reuse current item |
| 586 | it.item.wg.Wait() // Just cleaner to wait before pushing to avoid doing ref counting. |
Naman Jain | 38eb5a1 | 2021-01-07 13:18:00 | [diff] [blame] | 587 | it.scanned += len(it.item.key) + len(it.item.val) + len(it.item.vptr) + 2 |
Manish R Jain | 0f9030a | 2017-05-02 03:40:47 | [diff] [blame] | 588 | it.waste.push(it.item) |
| 589 | |
| 590 | // Set next item to current |
| 591 | it.item = it.data.pop() |
Harshil Goel | b77f2e8 | 2024-09-12 07:12:55 | [diff] [blame] | 592 | for it.iitr.Valid() && hasPrefix(it) { |
Manish R Jain | 50a2e6d | 2017-09-28 00:33:28 | [diff] [blame] | 593 | if it.parseItem() { |
| 594 | // parseItem calls one extra next. |
| 595 | // This is used to deal with the complexity of reverse iteration. |
Szymon | 921c153 | 2017-05-19 10:21:50 | [diff] [blame] | 596 | break |
| 597 | } |
Manish R Jain | 4ab1548 | 2017-04-28 09:38:01 | [diff] [blame] | 598 | } |
Manish R Jain | 50a2e6d | 2017-09-28 00:33:28 | [diff] [blame] | 599 | } |
Manish R Jain | 306b99d | 2017-05-29 04:27:42 | [diff] [blame] | 600 | |
Janardhan Reddy | a057627 | 2017-12-19 07:26:36 | [diff] [blame] | 601 | func isDeletedOrExpired(meta byte, expiresAt uint64) bool { |
| 602 | if meta&bitDelete > 0 { |
Deepak Jois | a5499e5 | 2017-11-02 02:03:14 | [diff] [blame] | 603 | return true |
| 604 | } |
Janardhan Reddy | a057627 | 2017-12-19 07:26:36 | [diff] [blame] | 605 | if expiresAt == 0 { |
Deepak Jois | a5499e5 | 2017-11-02 02:03:14 | [diff] [blame] | 606 | return false |
| 607 | } |
Janardhan Reddy | a057627 | 2017-12-19 07:26:36 | [diff] [blame] | 608 | return expiresAt <= uint64(time.Now().Unix()) |
Deepak Jois | a5499e5 | 2017-11-02 02:03:14 | [diff] [blame] | 609 | } |
| 610 | |
// parseItem is a complex function because it needs to handle both forward and reverse iteration
// implementation. We store keys such that their versions are sorted in descending order. This makes
// forward iteration efficient, but reverse iteration complicated. This tradeoff is better because
// forward iteration is more common than reverse.
//
// It returns true when it has stored an item, either as it.item (if there was no current item) or
// appended to the it.data list. It returns false when the entry under the cursor was skipped.
//
// This function advances the iterator: every path calls Next on the underlying iterator at least
// once. The caller must ensure the underlying iterator is valid before calling it.
func (it *Iterator) parseItem() bool {
	mi := it.iitr
	key := mi.Key()

	// setItem publishes a parsed item: it becomes the current item if there is none, otherwise it
	// is queued behind the current one.
	setItem := func(item *Item) {
		if it.item == nil {
			it.item = item
		} else {
			it.data.push(item)
		}
	}

	isInternalKey := bytes.HasPrefix(key, badgerPrefix)
	// Skip badger keys, unless the iterator was opened with InternalAccess.
	if !it.opt.InternalAccess && isInternalKey {
		mi.Next()
		return false
	}

	// Skip any versions which are beyond the readTs.
	version := y.ParseTs(key)
	// Ignore everything that is above the readTs and below or at the sinceTs.
	// A SinceTs of zero disables the lower bound.
	if version > it.readTs || (it.opt.SinceTs > 0 && version <= it.opt.SinceTs) {
		mi.Next()
		return false
	}

	// Skip banned keys only if it does not have badger internal prefix.
	if !isInternalKey && it.txn.db.isBanned(key) != nil {
		mi.Next()
		return false
	}

	if it.opt.AllVersions {
		// Return deleted or expired values also, otherwise user can't figure out
		// whether the key was deleted.
		item := it.newItem()
		it.fill(item)
		setItem(item)
		mi.Next()
		return true
	}

	// If iterating in forward direction, then just checking the last key against current key would
	// be sufficient.
	if !it.opt.Reverse {
		if y.SameKey(it.lastKey, key) {
			mi.Next()
			return false
		}
		// Only track in forward direction.
		// We should update lastKey as soon as we find a different key in our snapshot.
		// Consider keys: a 5, b 7 (del), b 5. When iterating, lastKey = a.
		// Then we see b 7, which is deleted. If we don't store lastKey = b, we'll then return b 5,
		// which is wrong. Therefore, update lastKey here.
		it.lastKey = y.SafeCopy(it.lastKey, mi.Key())
	}

FILL:
	// If deleted or expired, advance and return.
	vs := mi.Value()
	if isDeletedOrExpired(vs.Meta, vs.ExpiresAt) {
		mi.Next()
		return false
	}

	item := it.newItem()
	it.fill(item)
	// fill item based on current cursor position. All Next calls have returned, so reaching here
	// means no Next was called.

	mi.Next()                           // Advance but no fill item yet.
	if !it.opt.Reverse || !mi.Valid() { // Forward direction, or invalid.
		setItem(item)
		return true
	}

	// Reverse direction. The entry now under the cursor may be another version of the same key
	// that is still visible at readTs; if so it supersedes the item filled above.
	nextTs := y.ParseTs(mi.Key())
	mik := y.ParseKey(mi.Key())
	if nextTs <= it.readTs && bytes.Equal(mik, item.key) {
		// This is a valid potential candidate.
		// NOTE(review): the item filled above is not pushed back to it.waste on this path; a new
		// one is obtained from newItem after the jump.
		goto FILL
	}
	// Ignore the next candidate. Return the current one.
	setItem(item)
	return true
}
Manish R Jain | 6911fde | 2017-04-16 06:20:16 | [diff] [blame] | 706 | |
Manish R Jain | abaad90 | 2017-10-04 10:55:56 | [diff] [blame] | 707 | func (it *Iterator) fill(item *Item) { |
Manish R Jain | 0f9030a | 2017-05-02 03:40:47 | [diff] [blame] | 708 | vs := it.iitr.Value() |
| 709 | item.meta = vs.Meta |
Janardhan Reddy | 835ef24 | 2017-07-27 00:19:14 | [diff] [blame] | 710 | item.userMeta = vs.UserMeta |
Deepak Jois | a5499e5 | 2017-11-02 02:03:14 | [diff] [blame] | 711 | item.expiresAt = vs.ExpiresAt |
Janardhan Reddy | ffaaa66 | 2017-10-02 06:47:46 | [diff] [blame] | 712 | |
| 713 | item.version = y.ParseTs(it.iitr.Key()) |
Manish R Jain | b3568eb | 2017-11-13 03:04:41 | [diff] [blame] | 714 | item.key = y.SafeCopy(item.key, y.ParseKey(it.iitr.Key())) |
Janardhan Reddy | ffaaa66 | 2017-10-02 06:47:46 | [diff] [blame] | 715 | |
Manish R Jain | b3568eb | 2017-11-13 03:04:41 | [diff] [blame] | 716 | item.vptr = y.SafeCopy(item.vptr, vs.Value) |
Deepak Jois | 71abccc | 2017-08-23 10:26:26 | [diff] [blame] | 717 | item.val = nil |
Deepak Jois | b9aae1b | 2017-08-31 05:06:49 | [diff] [blame] | 718 | if it.opt.PrefetchValues { |
Manish R Jain | 0f9030a | 2017-05-02 03:40:47 | [diff] [blame] | 719 | item.wg.Add(1) |
Manish R Jain | 55c350d | 2017-05-30 10:04:50 | [diff] [blame] | 720 | go func() { |
Deepak Jois | b9aae1b | 2017-08-31 05:06:49 | [diff] [blame] | 721 | // FIXME we are not handling errors here. |
| 722 | item.prefetchValue() |
Manish R Jain | 55c350d | 2017-05-30 10:04:50 | [diff] [blame] | 723 | item.wg.Done() |
| 724 | }() |
Manish R Jain | 0f9030a | 2017-05-02 03:40:47 | [diff] [blame] | 725 | } |
| 726 | } |
Manish R Jain | 6911fde | 2017-04-16 06:20:16 | [diff] [blame] | 727 | |
Harshil Goel | b77f2e8 | 2024-09-12 07:12:55 | [diff] [blame] | 728 | func hasPrefix(it *Iterator) bool { |
| 729 | // We shouldn't check prefix in case the iterator is going in reverse. Since in reverse we expect |
| 730 | // people to append items to the end of prefix. |
| 731 | if !it.opt.Reverse && len(it.opt.Prefix) > 0 { |
| 732 | return bytes.HasPrefix(y.ParseKey(it.iitr.Key()), it.opt.Prefix) |
Harshil Goel | 2c148fe | 2024-08-14 12:35:42 | [diff] [blame] | 733 | } |
| 734 | return true |
| 735 | } |
| 736 | |
Manish R Jain | 0f9030a | 2017-05-02 03:40:47 | [diff] [blame] | 737 | func (it *Iterator) prefetch() { |
Deepak Jois | b9aae1b | 2017-08-31 05:06:49 | [diff] [blame] | 738 | prefetchSize := 2 |
| 739 | if it.opt.PrefetchValues && it.opt.PrefetchSize > 1 { |
| 740 | prefetchSize = it.opt.PrefetchSize |
Pawan Rawal | 748aadf | 2017-08-18 07:22:24 | [diff] [blame] | 741 | } |
| 742 | |
Manish R Jain | 0f9030a | 2017-05-02 03:40:47 | [diff] [blame] | 743 | i := it.iitr |
| 744 | var count int |
| 745 | it.item = nil |
Harshil Goel | b77f2e8 | 2024-09-12 07:12:55 | [diff] [blame] | 746 | for i.Valid() && hasPrefix(it) { |
Manish R Jain | 50a2e6d | 2017-09-28 00:33:28 | [diff] [blame] | 747 | if !it.parseItem() { |
Szymon | 921c153 | 2017-05-19 10:21:50 | [diff] [blame] | 748 | continue |
| 749 | } |
Manish R Jain | 0f9030a | 2017-05-02 03:40:47 | [diff] [blame] | 750 | count++ |
Pawan Rawal | 748aadf | 2017-08-18 07:22:24 | [diff] [blame] | 751 | if count == prefetchSize { |
Manish R Jain | 0f9030a | 2017-05-02 03:40:47 | [diff] [blame] | 752 | break |
Manish R Jain | 6911fde | 2017-04-16 06:20:16 | [diff] [blame] | 753 | } |
| 754 | } |
| 755 | } |
| 756 | |
Martin Martinez Rivera | 3e0e35e | 2019-06-07 17:13:10 | [diff] [blame] | 757 | // Seek would seek to the provided key if present. If absent, it would seek to the next |
| 758 | // smallest key greater than the provided key if iterating in the forward direction. |
| 759 | // Behavior would be reversed if iterating backwards. |
Manish R Jain | 0f9030a | 2017-05-02 03:40:47 | [diff] [blame] | 760 | func (it *Iterator) Seek(key []byte) { |
Naman Jain | 195b212 | 2021-02-22 15:59:19 | [diff] [blame] | 761 | if it.iitr == nil { |
| 762 | return |
| 763 | } |
Ibrahim Jarif | da80eb9 | 2020-06-01 10:28:18 | [diff] [blame] | 764 | if len(key) > 0 { |
| 765 | it.txn.addReadKey(key) |
| 766 | } |
Pawan Rawal | b50cd8f | 2017-05-20 10:32:58 | [diff] [blame] | 767 | for i := it.data.pop(); i != nil; i = it.data.pop() { |
Manish R Jain | 0f9030a | 2017-05-02 03:40:47 | [diff] [blame] | 768 | i.wg.Wait() |
| 769 | it.waste.push(i) |
Manish R Jain | 6911fde | 2017-04-16 06:20:16 | [diff] [blame] | 770 | } |
Janardhan Reddy | 9b31d1a | 2017-10-03 23:53:02 | [diff] [blame] | 771 | |
| 772 | it.lastKey = it.lastKey[:0] |
| 773 | if len(key) == 0 { |
Manish R Jain | 7d46029 | 2018-11-25 19:53:03 | [diff] [blame] | 774 | key = it.opt.Prefix |
| 775 | } |
| 776 | if len(key) == 0 { |
Janardhan Reddy | 9b31d1a | 2017-10-03 23:53:02 | [diff] [blame] | 777 | it.iitr.Rewind() |
| 778 | it.prefetch() |
| 779 | return |
| 780 | } |
| 781 | |
| 782 | if !it.opt.Reverse { |
| 783 | key = y.KeyWithTs(key, it.txn.readTs) |
| 784 | } else { |
| 785 | key = y.KeyWithTs(key, 0) |
| 786 | } |
Manish R Jain | 0f9030a | 2017-05-02 03:40:47 | [diff] [blame] | 787 | it.iitr.Seek(key) |
| 788 | it.prefetch() |
Manish R Jain | 6911fde | 2017-04-16 06:20:16 | [diff] [blame] | 789 | } |
| 790 | |
Manish R Jain | 8315136 | 2017-05-11 01:50:48 | [diff] [blame] | 791 | // Rewind would rewind the iterator cursor all the way to zero-th position, which would be the |
| 792 | // smallest key if iterating forward, and largest if iterating backward. It does not keep track of |
| 793 | // whether the cursor started with a Seek(). |
Manish R Jain | 0f9030a | 2017-05-02 03:40:47 | [diff] [blame] | 794 | func (it *Iterator) Rewind() { |
Manish R Jain | 7d46029 | 2018-11-25 19:53:03 | [diff] [blame] | 795 | it.Seek(nil) |
Manish R Jain | 0f9030a | 2017-05-02 03:40:47 | [diff] [blame] | 796 | } |