-
Notifications
You must be signed in to change notification settings - Fork 242
Expand file tree
/
Copy pathobjects.c
More file actions
935 lines (817 loc) · 28 KB
/
objects.c
File metadata and controls
935 lines (817 loc) · 28 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include "child.h"
#include "debug.h"
#include "fd.h"
#include "list.h"
#include "locks.h"
#include "objects.h"
#include "params.h"
#include "pids.h"
#include "random.h"
#include "shm.h"
#include "trinity.h"
#include "utils.h"
/* Registry of per-type global object initializers; populated before
 * init_global_objects() runs.  Statically initialized as an empty,
 * self-referential list head. */
static struct list_head global_obj_list = { &global_obj_list, &global_obj_list };

/* Append one initializer entry to the registry. */
void register_global_obj_init(struct global_obj_entry *entry)
{
	/* entry->list is the embedded link node; the cast adapts it to
	 * list_add_tail's struct list_head * parameter. */
	list_add_tail((struct list_head *) &entry->list, &global_obj_list);
}
/*
 * Run every registered global-object initializer, announcing each
 * type by name as it is brought up.
 */
void init_global_objects(void)
{
	struct list_head *node;

	list_for_each(node, &global_obj_list) {
		struct global_obj_entry *e;

		e = (struct global_obj_entry *) node;
		output(0, "Initializing %s objects.\n", e->name);
		e->init();
	}
}
/*
* Walk every global obj list and invoke the existing list validator on
* each entry. __list_del_entry_valid_or_die() emits the standard
* "back-link broken" / "entry was zeroed" / "use-after-list_del"
* diagnostics and __BUGs on the first inconsistency, so a corruption
* gets pinned to the next idle pass instead of waiting for the next
* unrelated list_add or list_del to crash.
*
* Read-only walk: the obj heap is mprotected PROT_READ post-freeze,
* which still permits the loads we need. The lists themselves are
* mutated only from the parent (under shm->objlock); main_loop is the
* only parent thread, so a same-process walk does not race against a
* concurrent mutator.
*/
void validate_global_object_lists(void)
{
	unsigned int type;

	for (type = 0; type < MAX_OBJECT_TYPES; type++) {
		struct list_head *sentinel = shm->global_objects[type].list;
		struct list_head *cursor;

		if (sentinel == NULL)
			continue;

		/* A NULL link in the sentinel itself means a stray write
		 * zeroed the list head. */
		if (sentinel->next == NULL || sentinel->prev == NULL) {
			outputerr("validate_global_object_lists: type %u: "
				  "head=%p has NULL link (next=%p prev=%p)\n",
				  type, sentinel, sentinel->next, sentinel->prev);
			__BUG("global list head corrupted",
			      __FILE__, __func__, __LINE__);
		}

		/* Re-run the per-entry validator over the whole chain.
		 * This catches corruption that happened between mutations,
		 * which the per-mutation validator never sees. */
		for (cursor = sentinel->next; cursor != sentinel; cursor = cursor->next)
			__list_del_entry_valid_or_die(cursor, __FILE__,
						      __func__, __LINE__);
	}
}
/*
* Hash table mapping fd → (object, type) for O(1) lookup in
* remove_object_by_fd(). Open-addressing with linear probing.
*
* The table itself lives in shm (shm->fd_hash) so children can read
* the per-slot generation counter the parent updates on every fd-table
* mutation. Mutations happen under shm->objlock; child reads of the
* gen field are unlocked and use ACQUIRE semantics.
*/
/* Reset every hash slot to empty (fd = -1, generation 0) and zero the
 * population count. */
void fd_hash_init(void)
{
	unsigned int slot;

	for (slot = 0; slot < FD_HASH_SIZE; slot++) {
		shm->fd_hash[slot].fd = -1;
		shm->fd_hash[slot].gen = 0;
	}
	shm->fd_hash_count = 0;
}
/* Home slot for an fd: low bits of the fd, relying on FD_HASH_SIZE
 * being a power of two (the mask form is used table-wide). */
static unsigned int fd_hash_slot(int fd)
{
	unsigned int key = (unsigned int) fd;

	return key & (FD_HASH_SIZE - 1);
}
/*
* Internal insert that preserves the entry's existing generation and
* doesn't update fd_hash_count. Used by fd_hash_remove to re-hash
* displaced entries: the entry's identity is unchanged, only its slot,
* so any cached gen on a child must continue to match.
*/
static void fd_hash_reinsert(int fd, struct object *obj, enum objecttype type,
	uint32_t gen)
{
	unsigned int slot;
	unsigned int probe;

	/* Linear-probe from the fd's home slot to the first empty slot. */
	slot = fd_hash_slot(fd);
	for (probe = 0; probe < FD_HASH_SIZE; probe++) {
		if (shm->fd_hash[slot].fd == -1)
			break;
		slot = (slot + 1) & (FD_HASH_SIZE - 1);
	}
	/* Defensive: no empty slot found.  The caller just vacated one,
	 * so this should be unreachable; the entry is silently dropped. */
	if (probe == FD_HASH_SIZE)
		return;
	/* Publish payload (obj/type/gen) first, then fd with RELEASE, so a
	 * lockless reader that ACQUIRE-loads fd sees a complete entry. */
	shm->fd_hash[slot].obj = obj;
	shm->fd_hash[slot].type = type;
	__atomic_store_n(&shm->fd_hash[slot].gen, gen, __ATOMIC_RELEASE);
	__atomic_store_n(&shm->fd_hash[slot].fd, fd, __ATOMIC_RELEASE);
}
/*
 * Insert (or update in place) the fd → (obj, type) mapping.
 * Returns false only when the table is completely full; fd < 0 is
 * accepted as a successful no-op so callers need not special-case
 * fd-less objects.  Mutations happen under shm->objlock (see the
 * table's design comment above fd_hash_init).
 */
bool fd_hash_insert(int fd, struct object *obj, enum objecttype type)
{
	unsigned int slot;
	uint32_t gen;

	if (fd < 0)
		return true;
	if (shm->fd_hash_count >= FD_HASH_SIZE)
		return false;
	/* Probe to either an empty slot or this fd's existing slot; the
	 * latter turns the insert into an in-place update. */
	slot = fd_hash_slot(fd);
	while (shm->fd_hash[slot].fd != -1 && shm->fd_hash[slot].fd != fd)
		slot = (slot + 1) & (FD_HASH_SIZE - 1);
	/* Only a brand-new occupant grows the population count. */
	if (shm->fd_hash[slot].fd == -1)
		shm->fd_hash_count++;
	shm->fd_hash[slot].obj = obj;
	shm->fd_hash[slot].type = type;
	/*
	 * Bump the slot's generation so any child that cached the
	 * previous occupant's (or absence) gen sees a mismatch. The
	 * RELEASE-store on fd publishes the entry — children using
	 * ACQUIRE-load on fd see the updated gen too.
	 */
	gen = shm->fd_hash[slot].gen + 1;
	__atomic_store_n(&shm->fd_hash[slot].gen, gen, __ATOMIC_RELEASE);
	__atomic_store_n(&shm->fd_hash[slot].fd, fd, __ATOMIC_RELEASE);
	return true;
}
/*
 * Remove fd's entry, then re-home every entry in the probe run that
 * follows it (standard linear-probing deletion: leaving a hole in the
 * middle of a run would make later lookups stop early).  No-op when
 * fd is negative or absent.
 */
void fd_hash_remove(int fd)
{
	unsigned int slot, next, i;

	if (fd < 0)
		return;
	slot = fd_hash_slot(fd);
	for (i = 0; i < FD_HASH_SIZE; i++) {
		/* Empty slot terminates the probe run: fd isn't here. */
		if (shm->fd_hash[slot].fd == -1)
			return;
		if (shm->fd_hash[slot].fd == fd) {
			uint32_t gen;

			/*
			 * Mark the slot empty and bump its generation so a
			 * child that cached this fd's gen sees a mismatch
			 * even before any replacement is inserted here.
			 */
			gen = shm->fd_hash[slot].gen + 1;
			__atomic_store_n(&shm->fd_hash[slot].gen, gen,
					 __ATOMIC_RELEASE);
			__atomic_store_n(&shm->fd_hash[slot].fd, -1,
					 __ATOMIC_RELEASE);
			shm->fd_hash_count--;
			/* Re-hash the remainder of the run.  Displaced
			 * entries keep their generation — identity is
			 * unchanged, only the slot (see fd_hash_reinsert). */
			next = (slot + 1) & (FD_HASH_SIZE - 1);
			while (shm->fd_hash[next].fd != -1) {
				struct fd_hash_entry displaced = shm->fd_hash[next];

				__atomic_store_n(&shm->fd_hash[next].fd, -1,
						 __ATOMIC_RELEASE);
				fd_hash_reinsert(displaced.fd, displaced.obj,
						 displaced.type, displaced.gen);
				next = (next + 1) & (FD_HASH_SIZE - 1);
			}
			return;
		}
		slot = (slot + 1) & (FD_HASH_SIZE - 1);
	}
}
/*
 * Find the hash entry for an fd, or NULL.  The fd field is read with
 * ACQUIRE so lockless child callers see entries only after their
 * payload has been fully published (pairs with the RELEASE stores in
 * fd_hash_insert / fd_hash_reinsert).
 */
struct fd_hash_entry *fd_hash_lookup(int fd)
{
	unsigned int idx, probes;

	if (fd < 0)
		return NULL;

	idx = fd_hash_slot(fd);
	for (probes = 0; probes < FD_HASH_SIZE; probes++) {
		int occupant;

		occupant = __atomic_load_n(&shm->fd_hash[idx].fd, __ATOMIC_ACQUIRE);
		if (occupant == -1)
			return NULL;
		if (occupant == fd)
			return &shm->fd_hash[idx];
		idx = (idx + 1) & (FD_HASH_SIZE - 1);
	}
	return NULL;
}
static bool is_fd_type(enum objecttype type)
{
return type >= OBJ_FD_PIPE && type <= OBJ_FD_FS_CTX;
}
/* Allocate a zeroed object with its embedded list node ready for use. */
struct object * alloc_object(void)
{
	struct object *newobj;

	newobj = zmalloc(sizeof(struct object));
	INIT_LIST_HEAD(&newobj->list);
	return newobj;
}
/*
* Release an obj struct via the right deallocator for its (scope, type).
* OBJ_GLOBAL types that opted into the shared obj heap (shared_alloc=true,
* set by the type's init function) came from alloc_shared_obj() and must
* be returned via free_shared_obj() — calling free() on a pointer into
* the shared heap would hand a non-malloc'd address to glibc. All other
* callers used zmalloc() and want plain free().
*/
static void release_obj(struct object *obj, enum obj_scope scope,
enum objecttype type)
{
if (scope == OBJ_GLOBAL && shm->global_objects[type].shared_alloc)
free_shared_obj(obj, sizeof(struct object));
else
free(obj);
}
/*
 * Resolve the objhead for a (scope, type) pair.  Global heads live in
 * shared memory; local heads hang off the calling child's childdata.
 * Returns NULL for OBJ_LOCAL when this_child() can't identify the
 * caller — callers must check.
 */
struct objhead * get_objhead(enum obj_scope scope, enum objecttype type)
{
	struct childdata *child;

	if (scope == OBJ_GLOBAL)
		return &shm->global_objects[type];

	child = this_child();
	if (child == NULL)
		return NULL;
	return &child->objects[type];
}
/*
* Fixed capacity for global object arrays. These are allocated in
* MAP_SHARED memory so children can safely read them. Using realloc()
* on private heap would put the new array in the parent's address space
* only, causing children to SIGSEGV when they follow the pointer.
*
* Exposed in objects.h so other code (e.g. mm/maps.c) can use the
* same upper bound when defending against corrupt global lists.
*/
/*
 * Add an object to the (scope, type) pool: link it onto the list,
 * store it in the parallel array, and (for global fd types) register
 * it in the fd hash.  On any failure the object is fully unwound —
 * unlinked, its fd closed (fd types), and released.
 */
void add_object(struct object *obj, enum obj_scope scope, enum objecttype type)
{
	struct objhead *head;
	bool was_protected = false;

	/* Children must not mutate global objects — the objhead metadata
	 * is in shared memory but the objects/arrays are in per-process
	 * heap (COW after fork). Mixing the two corrupts everything. */
	if (scope == OBJ_GLOBAL && getpid() != mainpid) {
		release_obj(obj, scope, type);
		return;
	}

	if (scope == OBJ_GLOBAL) {
		lock(&shm->objlock);
		/* Most parent-side OBJ_GLOBAL adds happen during init,
		 * before freeze. The post-freeze case is fd regeneration
		 * via try_regenerate_fd() — temporarily lift the RO
		 * protection so the list/array writes can land. */
		if (globals_are_protected()) {
			thaw_global_objects();
			was_protected = true;
		}
	}

	head = get_objhead(scope, type);
	/* Lazily create the list sentinel if init didn't (global lists
	 * must live in shared memory so children can read them). */
	if (head->list == NULL) {
		if (scope == OBJ_GLOBAL) {
			head->list = alloc_shared_global(sizeof(struct list_head));
		} else {
			head->list = zmalloc(sizeof(struct list_head));
		}
		INIT_LIST_HEAD(head->list);
	}
	list_add_tail(&obj->list, head->list);

	/* For global objects, the array was pre-allocated in shared
	 * memory by init_object_lists(). Never realloc — just reject
	 * if we've hit the fixed capacity. */
	if (scope == OBJ_GLOBAL) {
		if (head->num_entries >= head->array_capacity) {
			outputerr("add_object: global array full for type %u "
				  "(cap %u)\n", type, head->array_capacity);
			list_del(&obj->list);
			if (is_fd_type(type)) {
				int fd = fd_from_object(obj, type);
				if (fd >= 0)
					close(fd);
			}
			release_obj(obj, scope, type);
			goto out_unlock;
		}
	} else if (head->num_entries >= head->array_capacity) {
		/* Local objects: grow via realloc on private heap. */
		struct object **newarray;
		unsigned int newcap;

		newcap = head->array_capacity ? head->array_capacity * 2 : 16;
		newarray = realloc(head->array, newcap * sizeof(struct object *));
		if (newarray == NULL) {
			/* No lock held on the OBJ_LOCAL path, so a plain
			 * return (not out_unlock) is correct here. */
			outputerr("add_object: realloc failed for type %u (cap %u)\n",
				  type, newcap);
			list_del(&obj->list);
			if (is_fd_type(type)) {
				int fd = fd_from_object(obj, type);
				if (fd >= 0)
					close(fd);
			}
			release_obj(obj, scope, type);
			return;
		}
		head->array = newarray;
		head->array_capacity = newcap;
	}
	head->array[head->num_entries] = obj;
	obj->array_idx = head->num_entries;
	/*
	 * RELEASE-publish the new count so a child doing a lockless
	 * ACQUIRE-load in get_random_object() that sees count=N+1 also
	 * sees the array[N] = obj write that preceded it. For OBJ_LOCAL
	 * the pool is per-child private, so a plain store suffices.
	 */
	if (scope == OBJ_GLOBAL)
		__atomic_store_n(&head->num_entries, head->num_entries + 1,
				 __ATOMIC_RELEASE);
	else
		head->num_entries++;

	/* Track global fd-type objects in the hash table */
	if (scope == OBJ_GLOBAL && is_fd_type(type)) {
		int fd = fd_from_object(obj, type);

		if (!fd_hash_insert(fd, obj, type)) {
			unsigned int rollback = head->num_entries - 1;

			outputerr("add_object: fd hash full for type %u, dropping fd %d\n",
				  type, fd);
			/*
			 * Drop the count first so a concurrent lockless child
			 * read picking up the new snapshot sees the lower
			 * count and won't index past the (about-to-be-NULLed)
			 * tail slot. RELEASE pairs with the child's ACQUIRE.
			 */
			__atomic_store_n(&head->num_entries, rollback,
					 __ATOMIC_RELEASE);
			head->array[rollback] = NULL;
			list_del(&obj->list);
			if (fd >= 0)
				close(fd);
			release_obj(obj, scope, type);
			goto out_unlock;
		}
	}

	/* Per-object dumps are debug noise at startup (NFUTEXES = 5 * cpus
	 * identical "futex: 0 owner:0 scope:1" lines, etc.). Gate on -vv.
	 * dump_childdata() calls head->dump directly for crash diagnostics
	 * and is unaffected by this gate. */
	if (head->dump != NULL && verbosity > 2)
		head->dump(obj, scope);

out_unlock:
	if (scope == OBJ_GLOBAL) {
		if (was_protected)
			freeze_global_objects();
		unlock(&shm->objlock);
	}

	/* if we just added something to a child list, check
	 * to see if we need to do some pruning.
	 */
	if (scope == OBJ_LOCAL)
		prune_objects();
}
/*
 * Initialize every objhead for one scope: empty list sentinel, zeroed
 * count, and (for globals) the fixed-capacity shared-memory array.
 * child may be NULL only for OBJ_GLOBAL; an OBJ_LOCAL call with a
 * NULL child returns without doing anything.
 */
void init_object_lists(enum obj_scope scope, struct childdata *child)
{
	unsigned int i;

	for (i = 0; i < MAX_OBJECT_TYPES; i++) {
		struct objhead *head;

		if (scope == OBJ_GLOBAL)
			head = &shm->global_objects[i];
		else {
			if (child == NULL)
				return;
			head = &child->objects[i];
		}
		head->num_entries = 0;

		/* Pre-allocate the list head as an empty self-referential
		 * sentinel. Without this, add_object lazily allocates on
		 * first add — meaning any list_for_each on a type that
		 * never had an object added (e.g. perf events disabled,
		 * seccomp_notif unsupported, etc.) dereferences NULL and
		 * SIGSEGVs the caller. In production builds the catch-all
		 * sighandler swallows the SIGSEGV into _exit(EXIT_SUCCESS),
		 * so the crash is silent and impossible to attribute. */
		if (scope == OBJ_GLOBAL)
			head->list = alloc_shared_global(sizeof(struct list_head));
		else
			head->list = zmalloc(sizeof(struct list_head));
		INIT_LIST_HEAD(head->list);

		if (scope == OBJ_GLOBAL) {
			/* Pre-allocate the parallel array in MAP_SHARED memory
			 * so children can safely read it. Never realloc.
			 * Tagged global so freeze_global_objects() will mprotect
			 * it RO once init is done. */
			head->array = alloc_shared_global(GLOBAL_OBJ_MAX_CAPACITY *
							  sizeof(struct object *));
			memset(head->array, 0, GLOBAL_OBJ_MAX_CAPACITY *
			       sizeof(struct object *));
			head->array_capacity = GLOBAL_OBJ_MAX_CAPACITY;
		} else {
			/* Local arrays grow lazily via realloc in add_object. */
			head->array = NULL;
			head->array_capacity = 0;
		}

		/*
		 * child lists can inherit properties from global lists.
		 */
		if (scope == OBJ_LOCAL) {
			struct objhead *globalhead;

			globalhead = &shm->global_objects[i];
			head->max_entries = globalhead->max_entries;
			head->destroy = globalhead->destroy;
			head->dump = globalhead->dump;
		}
	}
}
/*
* Pick a random object from a pool.
*
* Lockless child read path (OBJ_GLOBAL):
* Children must NOT take shm->objlock here. Doing so deadlocks the
* fleet whenever a child is killed mid-syscall while holding objlock —
* the parent's reaper then blocks forever waiting for the dead child
* to release a lock it can never release. The defensive pid_alive()
* bypass added in e4e32ff0 (zombie pid_alive) papered over one
* instance of this; eliminating the lock acquisition on the child
* read path closes the whole class. Audit (task 4LSD-ae2QTmkKyPKHPo7hQ)
* identified 23 HIGH sites where children reach this lock; this fix
* collapses the entire category-A cluster (get_random_object on the
* syscall arg-pickers' hot path).
*
* Memory ordering:
* The child snapshots head->num_entries with __ATOMIC_ACQUIRE,
* pairing with the parent mutators (add_object, __destroy_object)
* that publish updates with __ATOMIC_RELEASE. Acquire/release
* guarantees that if the child observes count = N+1, it also
* observes the parent's array[N] = obj store that preceded the
* count bump. Without this pairing, a child could pick an index
* into a slot whose backing store hadn't yet propagated.
* Modeled on fd_hash_lookup() (objects.c:159) which uses the same
* pattern for the parallel fd hash table.
*
* Worst-case race:
* The child reads array[idx] without taking objlock, so it can read
* a stale pointer that the parent is concurrently overwriting (swap-
* with-last in __destroy_object) or whose target object the parent
* has just free()d. This is the SAME failure mode as the existing
* "OBJ_GLOBAL objects allocated in parent heap break for children"
* problem tracked in trinity-todo.md (item: OBJ_GLOBAL pool entries
* allocated in parent heap break for children) — the structural fix
* is to allocate the struct objects themselves in shared memory.
* Until that lands, the caller validates the returned pointer and
* the catch-all sighandler turns any raw deref crash into _exit;
* we are NOT making it worse, only widening an existing window.
*
* Why lockless is safe enough:
* 1. Parent mutators run while shm->global_objects is mprotect-thawed
* and re-freeze on completion — the array memory itself isn't
* remapped or relocated under the child (capacity is fixed at
* init, GLOBAL_OBJ_MAX_CAPACITY).
* 2. ACQUIRE/RELEASE on num_entries gives a consistent (count, slots)
* pair w.r.t. the most recent publish.
* 3. The remaining race (stale array[idx] pointer) is upper-bounded
* by the OBJ_GLOBAL-in-parent-heap problem and addressed by the
* separately-tracked structural fix.
*/
/*
 * Pick a random object from the (scope, type) pool, or NULL when the
 * pool is empty or unavailable.  See the design comment above for why
 * the child-side OBJ_GLOBAL path is deliberately lockless.
 */
struct object * get_random_object(enum objecttype type, enum obj_scope scope)
{
	struct objhead *head;
	struct object *obj;

	head = get_objhead(scope, type);
	/* get_objhead() returns NULL for OBJ_LOCAL when this_child()
	 * cannot resolve the calling process; bail instead of
	 * dereferencing NULL (find_local_object_by_fd() already guards
	 * the same lookup this way). */
	if (head == NULL)
		return NULL;

	if (scope == OBJ_GLOBAL && getpid() != mainpid) {
		unsigned int snapshot;

		/* ACQUIRE pairs with the parent's RELEASE publish of
		 * num_entries in add_object()/__destroy_object(), so
		 * array[idx] is visible for every idx < snapshot. */
		snapshot = __atomic_load_n(&head->num_entries,
					   __ATOMIC_ACQUIRE);
		if (snapshot == 0)
			return NULL;
		return head->array[rand() % snapshot];
	}

	/* Parent (and OBJ_LOCAL) path: globals are read under objlock. */
	if (scope == OBJ_GLOBAL)
		lock(&shm->objlock);
	if (head->num_entries == 0)
		obj = NULL;
	else
		obj = head->array[rand() % head->num_entries];
	if (scope == OBJ_GLOBAL)
		unlock(&shm->objlock);
	return obj;
}
bool objects_empty(enum objecttype type)
{
return shm->global_objects[type].num_entries == 0;
}
/*
* Invalidate the fd stored in an object by setting it to -1.
* Used before calling the destructor when the fd was already closed
* (e.g. after a successful close() syscall) to prevent double-close.
* The destructor's close(-1) call will harmlessly return EBADF.
*/
/*
 * Invalidate the fd stored in an object by setting it to -1, so a
 * later destructor close() harmlessly returns EBADF instead of
 * double-closing.  The per-type field mapping is identical to
 * set_object_fd()'s, so delegate to it with fd = -1 rather than
 * maintaining a second hand-synced 26-case switch.
 */
static void invalidate_object_fd(struct object *obj, enum objecttype type)
{
	set_object_fd(obj, type, -1);
}
/*
* Call the destructor for this object, and then release it.
* Internal version — caller must hold objlock if operating on globals.
*
* If already_closed is true, the fd has already been closed by the
* kernel (e.g. after a successful close() syscall). We invalidate
* the fd in the object so the destructor's close() call is a harmless
* no-op, while any other cleanup (munmap, free, etc.) still runs.
*/
static void __destroy_object(struct object *obj, enum obj_scope scope,
	enum objecttype type, bool already_closed)
{
	struct objhead *head;
	unsigned int idx, last;

	list_del(&obj->list);
	head = get_objhead(scope, type);

	/* Swap-with-last removal from the parallel array */
	idx = obj->array_idx;
	last = head->num_entries - 1;
	if (idx != last) {
		/* Move the tail entry into the vacated slot and keep its
		 * back-reference (array_idx) consistent. */
		head->array[idx] = head->array[last];
		if (head->array[idx] != NULL)
			head->array[idx]->array_idx = idx;
	}
	head->array[last] = NULL;
	/*
	 * Publish the new count with RELEASE semantics so a concurrent
	 * lockless child read in get_random_object() that observes the
	 * shrunk count cannot also observe an inconsistent earlier state
	 * of the array slots. See the design comment above
	 * get_random_object(). __prune_objects(OBJ_GLOBAL) is currently
	 * disabled but routes through here, so this also covers it
	 * defensively.
	 */
	if (scope == OBJ_GLOBAL)
		__atomic_store_n(&head->num_entries, last, __ATOMIC_RELEASE);
	else
		head->num_entries--;

	/* Remove from fd hash table */
	if (scope == OBJ_GLOBAL && is_fd_type(type))
		fd_hash_remove(fd_from_object(obj, type));

	/* Neutralize the fd before the destructor runs, so its close()
	 * becomes a no-op while other cleanup still happens. */
	if (already_closed && is_fd_type(type))
		invalidate_object_fd(obj, type);

	if (head->destroy != NULL)
		head->destroy(obj);
	release_obj(obj, scope, type);
}
/*
 * Public destroy wrapper: takes objlock for globals, temporarily lifts
 * the RO mprotect if the globals are frozen, runs the internal
 * destructor, then restores protection and drops the lock.
 */
void destroy_object(struct object *obj, enum obj_scope scope, enum objecttype type)
{
	bool refreeze = false;

	/* Children never tear down global objects. */
	if (scope == OBJ_GLOBAL && getpid() != mainpid)
		return;

	if (scope == OBJ_GLOBAL) {
		lock(&shm->objlock);
		if (globals_are_protected()) {
			thaw_global_objects();
			refreeze = true;
		}
	}

	__destroy_object(obj, scope, type, false);

	if (scope == OBJ_GLOBAL) {
		if (refreeze)
			freeze_global_objects();
		unlock(&shm->objlock);
	}
}
/*
* Destroy a whole list of objects.
*/
/*
 * Destroy every object of one type in one scope, then release (local)
 * or reset (global) the backing storage.
 *
 * Note: cleanup of head->array / head->list must run even when the
 * pool is empty — init_object_lists() pre-allocates the list sentinel
 * for every type, so the old early-return-on-empty leaked one
 * list_head per never-used OBJ_LOCAL type.
 */
static void destroy_objects(enum objecttype type, enum obj_scope scope)
{
	struct list_head *node, *list, *tmp;
	struct objhead *head;

	head = get_objhead(scope, type);
	if (head == NULL)
		return;

	list = head->list;
	if (list != NULL) {
		list_for_each_safe(node, tmp, list) {
			struct object *obj;

			obj = (struct object *) node;
			__destroy_object(obj, scope, type, false);
		}
	}
	head->num_entries = 0;

	/* Only free private-heap storage (OBJ_LOCAL). OBJ_GLOBAL arrays
	 * were allocated with alloc_shared() and cannot be freed. */
	if (scope == OBJ_LOCAL) {
		free(head->array);
		head->array = NULL;
		head->array_capacity = 0;
		free(head->list);
		head->list = NULL;
	} else if (head->array != NULL) {
		/* Zero out the shared array for reuse. */
		memset(head->array, 0, head->array_capacity * sizeof(struct object *));
	}
}
/* Destroy all global objects on exit. */
/* Destroy all global objects on exit. */
void destroy_global_objects(void)
{
	unsigned int type;

	/* Globals were mprotected RO after init; make them writable again
	 * before tearing them down.  The children have already been
	 * reaped by this point, so nothing can observe the thawed state. */
	thaw_global_objects();

	for (type = 0; type < MAX_OBJECT_TYPES; type++)
		destroy_objects(type, OBJ_GLOBAL);
}
/*
* Store an fd into the appropriate union field for this object type.
* The inverse of fd_from_object(); used by the generic post-hook that
* registers fds returned by RET_FD syscalls without a custom handler.
*/
/* One case per fd-carrying type; non-fd types fall to the no-op
 * default.  Must stay in sync with fd_from_object() below. */
void set_object_fd(struct object *obj, enum objecttype type, int fd)
{
	switch (type) {
	case OBJ_FD_PIPE: obj->pipeobj.fd = fd; break;
	/* dev/proc/sys files all share the fileobj union member. */
	case OBJ_FD_DEVFILE:
	case OBJ_FD_PROCFILE:
	case OBJ_FD_SYSFILE: obj->fileobj.fd = fd; break;
	case OBJ_FD_PERF: obj->perfobj.fd = fd; break;
	case OBJ_FD_EPOLL: obj->epollobj.fd = fd; break;
	case OBJ_FD_EVENTFD: obj->eventfdobj.fd = fd; break;
	case OBJ_FD_TIMERFD: obj->timerfdobj.fd = fd; break;
	case OBJ_FD_TESTFILE: obj->testfileobj.fd = fd; break;
	case OBJ_FD_MEMFD: obj->memfdobj.fd = fd; break;
	case OBJ_FD_DRM: obj->drmfd = fd; break;
	case OBJ_FD_INOTIFY: obj->inotifyobj.fd = fd; break;
	case OBJ_FD_SOCKET: obj->sockinfo.fd = fd; break;
	case OBJ_FD_USERFAULTFD: obj->userfaultobj.fd = fd; break;
	case OBJ_FD_FANOTIFY: obj->fanotifyobj.fd = fd; break;
	case OBJ_FD_BPF_MAP: obj->bpfobj.map_fd = fd; break;
	case OBJ_FD_BPF_PROG: obj->bpfprogobj.fd = fd; break;
	case OBJ_FD_BPF_LINK: obj->bpflinkobj.fd = fd; break;
	case OBJ_FD_BPF_BTF: obj->bpfbtfobj.fd = fd; break;
	case OBJ_FD_IO_URING: obj->io_uringobj.fd = fd; break;
	case OBJ_FD_LANDLOCK: obj->landlockobj.fd = fd; break;
	case OBJ_FD_PIDFD: obj->pidfdobj.fd = fd; break;
	case OBJ_FD_MQ: obj->mqobj.fd = fd; break;
	case OBJ_FD_SECCOMP_NOTIF: obj->seccomp_notifobj.fd = fd; break;
	case OBJ_FD_IOMMUFD: obj->iommufdobj.fd = fd; break;
	case OBJ_FD_FS_CTX: obj->fsctxobj.fd = fd; break;
	/* Non-fd object types: nothing to store. */
	default: break;
	}
}
/*
* Linear search the per-child OBJ_LOCAL pool of one type for an fd.
* Used by the generic post-hook to detect fds that a syscall-specific
* post handler already registered, so we don't double-track them.
* O(n) over a small n (typically tens of entries).
*/
/*
 * Linear search of the calling child's OBJ_LOCAL pool of one type for
 * an object holding this fd.  Returns NULL when the fd is negative,
 * the child can't be resolved, or no match exists.  O(n) over a small
 * pool — used by the generic post-hook to avoid double-tracking fds.
 */
struct object *find_local_object_by_fd(enum objecttype type, int fd)
{
	struct objhead *head;
	unsigned int idx;

	if (fd < 0)
		return NULL;

	head = get_objhead(OBJ_LOCAL, type);
	if (head == NULL)
		return NULL;

	for (idx = 0; idx < head->num_entries; idx++) {
		struct object *candidate = head->array[idx];

		if (candidate == NULL)
			continue;
		if (fd_from_object(candidate, type) == fd)
			return candidate;
	}
	return NULL;
}
/*
* Extract the fd from an object, given its type.
* Returns -1 for non-fd object types.
*/
/* One case per fd-carrying type; must stay in sync with
 * set_object_fd() above. */
int fd_from_object(struct object *obj, enum objecttype type)
{
	switch (type) {
	case OBJ_FD_PIPE: return obj->pipeobj.fd;
	/* dev/proc/sys files all share the fileobj union member. */
	case OBJ_FD_DEVFILE:
	case OBJ_FD_PROCFILE:
	case OBJ_FD_SYSFILE: return obj->fileobj.fd;
	case OBJ_FD_PERF: return obj->perfobj.fd;
	case OBJ_FD_EPOLL: return obj->epollobj.fd;
	case OBJ_FD_EVENTFD: return obj->eventfdobj.fd;
	case OBJ_FD_TIMERFD: return obj->timerfdobj.fd;
	case OBJ_FD_TESTFILE: return obj->testfileobj.fd;
	case OBJ_FD_MEMFD: return obj->memfdobj.fd;
	case OBJ_FD_DRM: return obj->drmfd;
	case OBJ_FD_INOTIFY: return obj->inotifyobj.fd;
	case OBJ_FD_SOCKET: return obj->sockinfo.fd;
	case OBJ_FD_USERFAULTFD: return obj->userfaultobj.fd;
	case OBJ_FD_FANOTIFY: return obj->fanotifyobj.fd;
	case OBJ_FD_BPF_MAP: return obj->bpfobj.map_fd;
	case OBJ_FD_BPF_PROG: return obj->bpfprogobj.fd;
	case OBJ_FD_BPF_LINK: return obj->bpflinkobj.fd;
	case OBJ_FD_BPF_BTF: return obj->bpfbtfobj.fd;
	case OBJ_FD_IO_URING: return obj->io_uringobj.fd;
	case OBJ_FD_LANDLOCK: return obj->landlockobj.fd;
	case OBJ_FD_PIDFD: return obj->pidfdobj.fd;
	case OBJ_FD_MQ: return obj->mqobj.fd;
	case OBJ_FD_SECCOMP_NOTIF: return obj->seccomp_notifobj.fd;
	case OBJ_FD_IOMMUFD: return obj->iommufdobj.fd;
	case OBJ_FD_FS_CTX: return obj->fsctxobj.fd;
	/* Non-fd object types carry no descriptor. */
	default: return -1;
	}
}
/*
* Look up an fd in the hash table and destroy its object.
* Called from fd_event_drain() after a child reported a close or dup2.
*
* The child closed its own copy of the fd (children have independent
* fd tables after fork). The parent's copy is still open and must be
* closed here — pass already_closed=false so the destructor runs
* close() on the parent's fd. Without this, every child close event
* leaks one fd in the parent, leading to fd exhaustion.
*/
void remove_object_by_fd(int fd)
{
	struct fd_hash_entry *entry;
	struct object *obj;
	enum objecttype type;
	bool was_protected = false;

	/* Parent-only: children must not mutate global pools. */
	if (getpid() != mainpid)
		return;

	lock(&shm->objlock);
	if (globals_are_protected()) {
		thaw_global_objects();
		was_protected = true;
	}
	entry = fd_hash_lookup(fd);
	if (entry == NULL) {
		/* fd wasn't tracked; restore protection and bail. */
		if (was_protected)
			freeze_global_objects();
		unlock(&shm->objlock);
		return;
	}
	/* Copy out before __destroy_object frees the backing object. */
	obj = entry->obj;
	type = entry->type;
	__atomic_add_fetch(&shm->stats.fd_closed_tracked, 1, __ATOMIC_RELAXED);
	__destroy_object(obj, OBJ_GLOBAL, type, false);
	unlock(&shm->objlock);
	/* try_regenerate_fd() may call add_object() which sees the
	 * thawed state (globals_are_protected() returns false here)
	 * and skips its own thaw/refreeze. We refreeze afterwards
	 * so the regeneration's writes stay covered by our window. */
	try_regenerate_fd(type);
	if (was_protected)
		freeze_global_objects();
}
/*
 * Randomly cull a full pool: once a type's pool reaches max_entries,
 * each object is destroyed with probability 1/10 in a single pass.
 * max_entries == 0 marks the type as never-pruned.
 */
static void __prune_objects(enum objecttype type, enum obj_scope scope)
{
	struct objhead *head;
	struct list_head *node, *list, *tmp;

	head = get_objhead(scope, type);

	/* 0 = don't ever prune. */
	if (head->max_entries == 0)
		return;

	/* Nothing to do until the pool is at capacity. */
	if (head->num_entries < head->max_entries)
		return;

	list = head->list;
	list_for_each_safe(node, tmp, list) {
		if (!ONE_IN(10))
			continue;
		destroy_object((struct object *) node, scope, type);
	}
}
/*
 * Occasionally sweep all local pools.  Pruning is deliberately lazy:
 * drifting slightly past ->max is fine, the next pass cleans it up.
 */
void prune_objects(void)
{
	unsigned int type;

	if (!(ONE_IN(10)))
		return;

	for (type = 0; type < MAX_OBJECT_TYPES; type++) {
		/* Global pools are deliberately left alone for now. */
		__prune_objects(type, OBJ_LOCAL);
	}
}