Ignore unreliable DNS servers.

Collect statistics about DNS query success state and delay. Ignore
servers that have been tried at least five times and have a success rate
of < 0.25. Retry these servers once every 180s.

Bug: 25731675
Change-Id: I78e24f43e388dca82fb81835e1796f4c7dce8da3
diff --git a/libc/dns/resolv/res_send.c b/libc/dns/resolv/res_send.c
index 3458e48..bfc6e1a 100644
--- a/libc/dns/resolv/res_send.c
+++ b/libc/dns/resolv/res_send.c
@@ -81,9 +81,6 @@
 #endif
 #endif /* LIBC_SCCS and not lint */
 
-/* set to 1 to use our small/simple/limited DNS cache */
-#define  USE_RESOLV_CACHE  1
-
 /*
  * Send query to name server and wait for reply.
  */
@@ -116,9 +113,7 @@
 
 #include <isc/eventlib.h>
 
-#if USE_RESOLV_CACHE
-#  include <resolv_cache.h>
-#endif
+#include <resolv_cache.h>
 
 #include "private/libc_logging.h"
 
@@ -133,6 +128,7 @@
 #endif
 #include "res_debug.h"
 #include "res_private.h"
+#include "resolv_stats.h"
 
 #define EXT(res) ((res)->_u._ext)
 #define DBG 0
@@ -144,10 +140,12 @@
 static int		get_salen __P((const struct sockaddr *));
 static struct sockaddr * get_nsaddr __P((res_state, size_t));
 static int		send_vc(res_state, const u_char *, int,
-				u_char *, int, int *, int);
+				u_char *, int, int *, int,
+				time_t *, int *, int *);
 static int		send_dg(res_state, const u_char *, int,
 				u_char *, int, int *, int,
-				int *, int *);
+				int *, int *,
+				time_t *, int *, int *);
 static void		Aerror(const res_state, FILE *, const char *, int,
 			       const struct sockaddr *, int);
 static void		Perror(const res_state, FILE *, const char *, int);
@@ -359,23 +357,13 @@
 	return (1);
 }
 
-
 int
 res_nsend(res_state statp,
 	  const u_char *buf, int buflen, u_char *ans, int anssiz)
 {
 	int gotsomewhere, terrno, try, v_circuit, resplen, ns, n;
 	char abuf[NI_MAXHOST];
-#if USE_RESOLV_CACHE
-        ResolvCacheStatus     cache_status = RESOLV_CACHE_UNSUPPORTED;
-#endif
-
-#if !USE_RESOLV_CACHE
-	if (statp->nscount == 0) {
-		errno = ESRCH;
-		return (-1);
-	}
-#endif
+	ResolvCacheStatus     cache_status = RESOLV_CACHE_UNSUPPORTED;
 
 	if (anssiz < HFIXEDSZ) {
 		errno = EINVAL;
@@ -387,7 +375,6 @@
 	gotsomewhere = 0;
 	terrno = ETIMEDOUT;
 
-#if USE_RESOLV_CACHE
 	int  anslen = 0;
 	cache_status = _resolv_cache_lookup(
 			statp->netid, buf, buflen,
@@ -400,7 +387,6 @@
 		// data so the normal resolve path can do its thing
 		_resolv_populate_res_for_net(statp);
 	}
-
 	if (statp->nscount == 0) {
 		// We have no nameservers configured, so there's no point trying.
 		// Tell the cache the query failed, or any retries and anyone else asking the same
@@ -409,7 +395,6 @@
 		errno = ESRCH;
 		return (-1);
 	}
-#endif
 
 	/*
 	 * If the ns_addr_list in the resolver context has changed, then
@@ -420,9 +405,9 @@
 		struct sockaddr_storage peer;
 		socklen_t peerlen;
 
-		if (EXT(statp).nscount != statp->nscount)
+		if (EXT(statp).nscount != statp->nscount) {
 			needclose++;
-		else
+		} else {
 			for (ns = 0; ns < statp->nscount; ns++) {
 				if (statp->nsaddr_list[ns].sin_family &&
 				    !sock_eq((struct sockaddr *)(void *)&statp->nsaddr_list[ns],
@@ -445,6 +430,7 @@
 					break;
 				}
 			}
+		}
 		if (needclose) {
 			res_nclose(statp);
 			EXT(statp).nscount = 0;
@@ -485,7 +471,7 @@
 		nstime = EXT(statp).nstimes[0];
 		for (ns = 0; ns < lastns; ns++) {
 			if (EXT(statp).ext != NULL)
-                                EXT(statp).ext->nsaddrs[ns] =
+				EXT(statp).ext->nsaddrs[ns] =
 					EXT(statp).ext->nsaddrs[ns + 1];
 			statp->nsaddr_list[ns] = statp->nsaddr_list[ns + 1];
 			EXT(statp).nssocks[ns] = EXT(statp).nssocks[ns + 1];
@@ -502,13 +488,24 @@
 	 * Send request, RETRY times, or until successful.
 	 */
 	for (try = 0; try < statp->retry; try++) {
+	    struct __res_stats stats[MAXNS];
+	    struct __res_params params;
+	    int revision_id = _resolv_cache_get_resolver_stats(statp->netid, &params, stats);
+	    bool usable_servers[MAXNS];
+	    _res_stats_get_usable_servers(&params, stats, statp->nscount, usable_servers);
+
 	    for (ns = 0; ns < statp->nscount; ns++) {
+		if (!usable_servers[ns]) continue;
 		struct sockaddr *nsap;
 		int nsaplen;
+		time_t now = 0;
+		int rcode = RCODE_INTERNAL_ERROR;
+		int delay = 0;
 		nsap = get_nsaddr(statp, (size_t)ns);
 		nsaplen = get_salen(nsap);
 		statp->_flags &= ~RES_F_LASTMASK;
 		statp->_flags |= (ns << RES_F_LASTSHIFT);
+
  same_ns:
 		if (statp->qhook) {
 			int done = 0, loops = 0;
@@ -552,7 +549,12 @@
 			try = statp->retry;
 
 			n = send_vc(statp, buf, buflen, ans, anssiz, &terrno,
-				    ns);
+				    ns, &now, &rcode, &delay);
+
+			struct __res_sample sample;
+			_res_stats_set_sample(&sample, now, rcode, delay);
+			_resolv_cache_add_resolver_stats_sample(statp->netid, revision_id, ns,
+				&sample, params.max_samples);
 
 			if (DBG) {
 				__libc_format_log(ANDROID_LOG_DEBUG, "libc",
@@ -571,7 +573,13 @@
 			}
 
 			n = send_dg(statp, buf, buflen, ans, anssiz, &terrno,
-				    ns, &v_circuit, &gotsomewhere);
+				    ns, &v_circuit, &gotsomewhere, &now, &rcode, &delay);
+
+			struct __res_sample sample;
+			_res_stats_set_sample(&sample, now, rcode, delay);
+			_resolv_cache_add_resolver_stats_sample(statp->netid, revision_id, ns,
+				&sample, params.max_samples);
+
 			if (DBG) {
 				__libc_format_log(ANDROID_LOG_DEBUG, "libc", "used send_dg %d\n",n);
 			}
@@ -582,7 +590,7 @@
 				goto next_ns;
 			if (DBG) {
 				__libc_format_log(ANDROID_LOG_DEBUG, "libc", "time=%ld\n",
-                                                  time(NULL));
+						  time(NULL));
 			}
 			if (v_circuit)
 				goto same_ns;
@@ -599,12 +607,10 @@
 			(stdout, "%s", ""),
 			ans, (resplen > anssiz) ? anssiz : resplen);
 
-#if USE_RESOLV_CACHE
-                if (cache_status == RESOLV_CACHE_NOTFOUND) {
-                    _resolv_cache_add(statp->netid, buf, buflen,
-                                      ans, resplen);
-                }
-#endif
+		if (cache_status == RESOLV_CACHE_NOTFOUND) {
+		    _resolv_cache_add(statp->netid, buf, buflen,
+				      ans, resplen);
+		}
 		/*
 		 * If we have temporarily opened a virtual circuit,
 		 * or if we haven't been asked to keep a socket open,
@@ -656,15 +662,12 @@
 	} else
 		errno = terrno;
 
-#if USE_RESOLV_CACHE
-        _resolv_cache_query_failed(statp->netid, buf, buflen);
-#endif
+	_resolv_cache_query_failed(statp->netid, buf, buflen);
 
 	return (-1);
  fail:
-#if USE_RESOLV_CACHE
+
 	_resolv_cache_query_failed(statp->netid, buf, buflen);
-#endif
 	res_nclose(statp);
 	return (-1);
 }
@@ -735,8 +738,11 @@
 static int
 send_vc(res_state statp,
 	const u_char *buf, int buflen, u_char *ans, int anssiz,
-	int *terrno, int ns)
+	int *terrno, int ns, time_t* at, int* rcode, int* delay)
 {
+	*at = 0;
+	*rcode = RCODE_INTERNAL_ERROR;
+	*delay = 0;
 	const HEADER *hp = (const HEADER *)(const void *)buf;
 	HEADER *anhp = (HEADER *)(void *)ans;
 	struct sockaddr *nsap;
@@ -758,6 +764,8 @@
  same_ns:
 	truncating = 0;
 
+	struct timespec now = evNowTime();
+
 	/* Are we still talking to whom we want to talk to? */
 	if (statp->_vcsock >= 0 && (statp->_flags & RES_F_VC) != 0) {
 		struct sockaddr_storage peer;
@@ -800,7 +808,7 @@
 		}
 		if (statp->_mark != MARK_UNSET) {
 			if (setsockopt(statp->_vcsock, SOL_SOCKET,
-				        SO_MARK, &statp->_mark, sizeof(statp->_mark)) < 0) {
+				    SO_MARK, &statp->_mark, sizeof(statp->_mark)) < 0) {
 				*terrno = errno;
 				Perror(statp, stderr, "setsockopt", errno);
 				return -1;
@@ -820,6 +828,15 @@
 			Aerror(statp, stderr, "connect/vc", errno, nsap,
 			    nsaplen);
 			res_nclose(statp);
+			/*
+			 * The way connect_with_timeout() is implemented prevents us from reliably
+			 * determining whether this was really a timeout or e.g. ECONNREFUSED. Since
+			 * currently both cases are handled in the same way, there is no need to
+			 * change this (yet). If we ever need to reliably distinguish between these
+			 * cases, both connect_with_timeout() and retrying_select() need to be
+			 * modified, though.
+			 */
+			*rcode = RCODE_TIMEOUT;
 			return (0);
 		}
 		statp->_flags |= RES_F_VC;
@@ -900,6 +917,10 @@
 		res_nclose(statp);
 		return (0);
 	}
+
+	struct timespec done = evNowTime();
+	*at = done.tv_sec;
+
 	if (truncating) {
 		/*
 		 * Flush rest of answer so connection stays in synch.
@@ -936,6 +957,10 @@
 	 * All is well, or the error is fatal.  Signal that the
 	 * next nameserver ought not be tried.
 	 */
+	if (resplen > 0) {
+	    *delay = _res_stats_calculate_rtt(&done, &now);
+	    *rcode = anhp->rcode;
+	}
 	return (resplen);
 }
 
@@ -952,8 +977,8 @@
 
 	res = __connect(sock, nsap, salen);
 	if (res < 0 && errno != EINPROGRESS) {
-                res = -1;
-                goto done;
+		res = -1;
+		goto done;
 	}
 	if (res != 0) {
 		now = evNowTime();
@@ -965,7 +990,7 @@
 
 		res = retrying_select(sock, &rset, &wset, &finish);
 		if (res <= 0) {
-                        res = -1;
+			res = -1;
 		}
 	}
 done:
@@ -987,7 +1012,7 @@
 
 retry:
 	if (DBG) {
-		__libc_format_log(ANDROID_LOG_DEBUG, "libc", "  %d retying_select\n", sock);
+		__libc_format_log(ANDROID_LOG_DEBUG, "libc", "  %d retrying_select\n", sock);
 	}
 
 	now = evNowTime();
@@ -1042,17 +1067,20 @@
 	return n;
 }
 
-
 static int
 send_dg(res_state statp,
 	const u_char *buf, int buflen, u_char *ans, int anssiz,
-	int *terrno, int ns, int *v_circuit, int *gotsomewhere)
+	int *terrno, int ns, int *v_circuit, int *gotsomewhere,
+	time_t *at, int *rcode, int* delay)
 {
+	*at = 0;
+	*rcode = RCODE_INTERNAL_ERROR;
+	*delay = 0;
 	const HEADER *hp = (const HEADER *)(const void *)buf;
 	HEADER *anhp = (HEADER *)(void *)ans;
 	const struct sockaddr *nsap;
 	int nsaplen;
-	struct timespec now, timeout, finish;
+	struct timespec now, timeout, finish, done;
 	fd_set dsmask;
 	struct sockaddr_storage from;
 	socklen_t fromlen;
@@ -1145,6 +1173,7 @@
 	n = retrying_select(s, &dsmask, NULL, &finish);
 
 	if (n == 0) {
+		*rcode = RCODE_TIMEOUT;
 		Dprint(statp->options & RES_DEBUG, (stdout, ";; timeout\n"));
 		*gotsomewhere = 1;
 		return (0);
@@ -1230,6 +1259,9 @@
 			ans, (resplen > anssiz) ? anssiz : resplen);
 		goto retry;;
 	}
+	done = evNowTime();
+	*at = done.tv_sec;
+	*delay = _res_stats_calculate_rtt(&done, &now);
 	if (anhp->rcode == SERVFAIL ||
 	    anhp->rcode == NOTIMP ||
 	    anhp->rcode == REFUSED) {
@@ -1238,8 +1270,10 @@
 			ans, (resplen > anssiz) ? anssiz : resplen);
 		res_nclose(statp);
 		/* don't retry if called from dig */
-		if (!statp->pfcode)
+		if (!statp->pfcode) {
+			*rcode = anhp->rcode;
 			return (0);
+		}
 	}
 	if (!(statp->options & RES_IGNTC) && anhp->tc) {
 		/*
@@ -1256,6 +1290,9 @@
 	 * All is well, or the error is fatal.  Signal that the
 	 * next nameserver ought not be tried.
 	 */
+	if (resplen > 0) {
+		*rcode = anhp->rcode;
+	}
 	return (resplen);
 }