2freeman

Keep growing with every tiny, trivial step.

CVE-2017-8890, which was found by ADLab in June 2017, had been lurking in the Linux kernel net subsystem for 11 years. ADLab wrote an article to explain it [1]. After reading the post, I knew that it is a double-free bug. But what puzzled me was how the memory is freed for the first time. To figure that out, I read the source code and wrote this blog post to explain it.

Analysis

Firstly, let’s look at the patch[2] committed to the upstream:

dccp/tcp: do not inherit mc_list from parent syzkaller found a way to trigger double frees from ip_mc_drop_socket() It turns out that leave a copy of parent mc_list at accept() time, which is very bad.

@@ -794,6 +794,8 @@ struct sock *inet_csk_clone_lock(const struct sock *sk,
		/* listeners have SOCK_RCU_FREE, not the children */
		sock_reset_flag(newsk, SOCK_RCU_FREE);
							 
+		inet_sk(newsk)->mc_list = NULL;
+
		newsk->sk_mark = inet_rsk(req)->ir_mark;
		atomic64_set(&newsk->sk_cookie,
		atomic64_read(&inet_rsk(req)->ir_cookie));

We know that the parent mc_list is double freed because a copy of it is left behind at accept() time. Let’s inspect accept(). All of the analysis is based on TCP/IPv4.

//accept()
SYSCALL_DEFINE4(accept4, int, fd, struct sockaddr __user *, upeer_sockaddr,
		int __user *, upeer_addrlen, int, flags)
{
	struct socket *sock, *newsock;

	//---- skip ----

	newsock = sock_alloc();
	if (!newsock)
		goto out_put;

	//---- skip ----

	// 1. get new file fd
	newfd = get_unused_fd_flags(flags);
	if (unlikely(newfd < 0)) {
		err = newfd;
		sock_release(newsock);
		goto out_put;
	}
	// 2. allocate new file with newsock
	newfile = sock_alloc_file(newsock, flags, sock->sk->sk_prot_creator->name);
	if (unlikely(IS_ERR(newfile))) {
		err = PTR_ERR(newfile);
		put_unused_fd(newfd);
		sock_release(newsock);
		goto out_put;
	}

	//---- skip ----

	// 3. main accept function
	err = sock->ops->accept(sock, newsock, sock->file->f_flags);
	if (err < 0)
		goto out_fd;

	//---- skip ----

	// 4. add newfd to current process
	fd_install(newfd, newfile);
}

A newsock is created in accept(), and the relationship between it and sock is established in 3:

//sock->ops->accept
inet_accept(struct socket *sock, struct socket *newsock, int flags)
{
	struct sock *sk1 = sock->sk;
	int err = -EINVAL;
	
	struct sock *sk2 = sk1->sk_prot->accept(sk1, flags, &err);

	//---- skip ----

	sock_graft(sk2, newsock);

	newsock->state = SS_CONNECTED;
	err = 0;
}

struct sock *inet_csk_accept(struct sock *sk, int flags, int *err)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct request_sock_queue *queue = &icsk->icsk_accept_queue;
	struct sock *newsk;
	struct request_sock *req;
	int error;

	lock_sock(sk);

	/* We need to make sure that this socket is listening,
	 * and that it has something pending.
	 */
	error = -EINVAL;
	if (sk->sk_state != TCP_LISTEN)
		goto out_err;

	/* Find already established connection */
	if (reqsk_queue_empty(queue)) {
		long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);

		/* If this is a non blocking socket don't sleep */
		error = -EAGAIN;
		if (!timeo)
			goto out_err;

		error = inet_csk_wait_for_connect(sk, timeo);
		if (error)
			goto out_err;
	}
	req = reqsk_queue_remove(queue);
	newsk = req->sk;

	//---- skip ----

out:
	release_sock(sk);
	if (req)
		__reqsk_free(req);
	return newsk;
}

The main job of inet_csk_accept() is to fetch a request sock from icsk->icsk_accept_queue. If there is no request, it may block until a request is available. When does a request join the queue? Before answering this question, I want to talk about how a datagram is processed by the network protocol stack. When a datagram comes in, it is processed from the bottom of the stack to the top. The handler at the network layer is ip_rcv(). When its work is done, it passes the data to the upper layer according to the protocol value in the header. As you know, the target protocol here is TCP, and its handler is tcp_v4_rcv(). We may find the answer in it.

int tcp_v4_rcv(struct sk_buff *skb)
{
	//---- skip ----

	th = tcp_hdr(skb);
	iph = ip_hdr(skb);
	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
				    skb->len - th->doff * 4);
	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
	TCP_SKB_CB(skb)->when	 = 0;
	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
	TCP_SKB_CB(skb)->sacked	 = 0;

	sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
	if (!sk)
		goto no_tcp_socket;

process:
	
	//---- skip ----

	if (!sock_owned_by_user(sk)) {
#ifdef CONFIG_NET_DMA
		struct tcp_sock *tp = tcp_sk(sk);
		if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
			tp->ucopy.dma_chan = net_dma_find_channel();
		if (tp->ucopy.dma_chan)
			ret = tcp_v4_do_rcv(sk, skb);
		else
#endif
		{
			if (!tcp_prequeue(sk, skb))
				ret = tcp_v4_do_rcv(sk, skb);
		}
	} 

	//---- skip ----
}

The __inet_lookup_skb() searches for the target sock according to th->dest. In programming terms, the target sock plays the server role, and the request sock plays the client role.

int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
	//---- skip ----

	if (sk->sk_state == TCP_LISTEN) {
		struct sock *nsk = tcp_v4_hnd_req(sk, skb);
		if (!nsk)
			goto discard;

		if (nsk != sk) {
			sock_rps_save_rxhash(nsk, skb);
			if (tcp_child_process(sk, nsk, skb)) {
				rsk = nsk;
				goto reset;
			}
			return 0;
		}
	} else
		sock_rps_save_rxhash(sk, skb);

	if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
		rsk = sk;
		goto reset;
	}

	//---- skip ----
}

At this point, the server sock’s state is TCP_LISTEN. Its state does not change, because it is always used to listen for connection requests. We know that the first request in the 3-way handshake is SYN, so tcp_v4_hnd_req() can’t find an established sock and returns the original sock.

int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
			  const struct tcphdr *th, unsigned int len)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct request_sock *req;
	int queued = 0;

	tp->rx_opt.saw_tstamp = 0;

	switch (sk->sk_state) {
	case TCP_CLOSE:
		goto discard;

	case TCP_LISTEN:
		if (th->ack)
			return 1;

		if (th->rst)
			goto discard;

		if (th->syn) {
			if (th->fin)
				goto discard;
			if (icsk->icsk_af_ops->conn_request(sk, skb) < 0)
				return 1;
			kfree_skb(skb);
			return 0;
		}

	//---- skip ----
}

/*
 * icsk->icsk_af_ops->conn_request
 * |-->tcp_v4_conn_request()
 *	   |-->tcp_v4_conn_req_fastopen()
 */
static int tcp_v4_conn_req_fastopen(struct sock *sk,
				    struct sk_buff *skb,
				    struct sk_buff *skb_synack,
				    struct request_sock *req)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue;
	const struct inet_request_sock *ireq = inet_rsk(req);
	struct sock *child;
	int err;

	req->num_retrans = 0;
	req->num_timeout = 0;
	req->sk = NULL;

	child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL);
	if (child == NULL) {
		NET_INC_STATS_BH(sock_net(sk),
				 LINUX_MIB_TCPFASTOPENPASSIVEFAIL);
		kfree_skb(skb_synack);
		return -1;
	}

	//---- skip ----

	/* Add the child socket directly into the accept queue */
	inet_csk_reqsk_queue_add(sk, req, child);

	//---- skip ----
}

/*
 * tcp_v4_conn_req_fastopen()
 * |-->inet_csk(sk)->icsk_af_ops->syn_recv_sock()
 *		|-->tcp_v4_syn_recv_sock()
 *			|-->tcp_create_openreq_child
 *				|-->inet_csk_clone_lock()
 */
struct sock *inet_csk_clone_lock(const struct sock *sk,
				 const struct request_sock *req,
				 const gfp_t priority)
{
	struct sock *newsk = sk_clone_lock(sk, priority);

	if (newsk != NULL) {
		struct inet_connection_sock *newicsk = inet_csk(newsk);

		newsk->sk_state = TCP_SYN_RECV;
		newicsk->icsk_bind_hash = NULL;

		inet_sk(newsk)->inet_dport = inet_rsk(req)->rmt_port;
		inet_sk(newsk)->inet_num = ntohs(inet_rsk(req)->loc_port);
		inet_sk(newsk)->inet_sport = inet_rsk(req)->loc_port;
		newsk->sk_write_space = sk_stream_write_space;

		/****** patch here ******/
		inet_sk(newsk)->mc_list = NULL;

		newsk->sk_mark = inet_rsk(req)->ir_mark;

		newicsk->icsk_retransmits = 0;
		newicsk->icsk_backoff	  = 0;
		newicsk->icsk_probes_out  = 0;

		/* Deinitialize accept_queue to trap illegal accesses. */
		memset(&newicsk->icsk_accept_queue, 0, sizeof(newicsk->icsk_accept_queue));

		security_inet_csk_clone(newsk, req);
	}
	return newsk;
}

The inet_csk_clone_lock() clones newsk from sk. However, without the patch it does not clear inet_sk(newsk)->mc_list, so it leaves a copy of the parent’s mc_list in the child.

/* tcp_v4_conn_req_fastopen()
 * |-->inet_csk_reqsk_queue_add()
 *		|-->reqsk_queue_add()
 */
static inline void reqsk_queue_add(struct request_sock_queue *queue,
				   struct request_sock *req,
				   struct sock *parent,
				   struct sock *child)
{
	req->sk = child;

	//---- skip ----
}

Then reqsk_queue_add() adds the child sock to the queue from which accept() fetches it.

PoC

sockfd = socket(AF_INET, xx, IPPROTO_TCP);
setsockopt(sockfd, SOL_IP, MCAST_JOIN_GROUP, xxxx, xxxx);
bind(sockfd, xxxx, xxxx);
listen(sockfd, xxxx);
newsockfd = accept(sockfd, xxxx, xxxx);
close(newsockfd);// first free (kfree_rcu)
sleep(5);// wait rcu free(real first free)
close(sockfd);// double free

The first free is triggered by close(newsockfd), but the memory is actually released later by the RCU kthread — which is why the PoC sleeps before closing sockfd.

Reference

[1] https://mp.weixin.qq.com/s/6NGH-Dk2n_BkdlJ2jSMWJQ

[2] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=657831ffc38e30092a2d5f03d385d710eb88b09a