Ignore unreliable DNS servers.
Collect statistics about DNS query success state and delay. Ignore
servers that have been tried at least five times and have a success rate
of < 0.25. Retry these servers once every 180s.
Bug: 25731675
Change-Id: I78e24f43e388dca82fb81835e1796f4c7dce8da3
diff --git a/libc/dns/include/resolv_netid.h b/libc/dns/include/resolv_netid.h
index d364645..09c5498 100644
--- a/libc/dns/include/resolv_netid.h
+++ b/libc/dns/include/resolv_netid.h
@@ -49,6 +49,7 @@
__BEGIN_DECLS
+struct __res_params;
struct addrinfo;
#define __used_in_netd __attribute__((visibility ("default")))
@@ -86,8 +87,8 @@
const struct android_net_context *, struct addrinfo **) __used_in_netd;
/* set name servers for a network */
-extern void _resolv_set_nameservers_for_net(unsigned netid,
- const char** servers, int numservers, const char *domains) __used_in_netd;
+extern void _resolv_set_nameservers_for_net(unsigned netid, const char** servers, int numservers,
+ const char *domains, const struct __res_params* params) __used_in_netd;
/* flush the cache associated with a certain network */
extern void _resolv_flush_cache_for_net(unsigned netid) __used_in_netd;
diff --git a/libc/dns/include/resolv_params.h b/libc/dns/include/resolv_params.h
new file mode 100644
index 0000000..353ae4d
--- /dev/null
+++ b/libc/dns/include/resolv_params.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright (C) 2016 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef _RESOLV_PARAMS_H
+#define _RESOLV_PARAMS_H
+
+#include <stdint.h>
+
+/* Hard-coded defines */
+#define MAXNS 3 /* max # name servers we'll track */
+#define MAXNSSAMPLES 64 /* max # samples to store per server */
+
+/* Defaults used for initializing __res_params */
+#define SUCCESS_THRESHOLD 75 /* if successes * 100 / total_samples is less than
+ * this value, the server is considered failing
+ */
+#define NSSAMPLE_VALIDITY 1800 /* Sample validity in seconds.
+ * Set to -1 to disable skipping failing
+ * servers.
+ */
+
+/* per-netid configuration parameters passed from netd to the resolver */
+struct __res_params {
+ uint16_t sample_validity; // sample lifetime in s
+ // threshold of success / total samples below which a server is considered broken
+ uint8_t success_threshold; // 0: disable, value / 100 otherwise
+ uint8_t min_samples; // min # samples needed for statistics to be considered meaningful
+ uint8_t max_samples; // max # samples taken into account for statistics
+} __attribute__((__packed__));
+
+#endif // _RESOLV_PARAMS_H
diff --git a/libc/dns/include/resolv_private.h b/libc/dns/include/resolv_private.h
index 9484d0e..8cdcc2e 100644
--- a/libc/dns/include/resolv_private.h
+++ b/libc/dns/include/resolv_private.h
@@ -58,7 +58,10 @@
#include <resolv.h>
#include "resolv_static.h"
+#include "resolv_params.h"
+#include "resolv_stats.h"
#include <net/if.h>
+#include <time.h>
/* Despite this file's name, it's part of libresolv. On Android, that means it's part of libc :-( */
#pragma GCC visibility push(default)
@@ -136,7 +139,6 @@
/*
* Global defines and variables for resolver stub.
*/
-#define MAXNS 3 /* max # name servers we'll track */
#define MAXDFLSRCH 3 /* # default domain levels to try */
#define MAXDNSRCH 6 /* max # domains in search path */
#define LOCALDOMAINPARTS 2 /* min levels in name that is "local" */
@@ -205,6 +207,24 @@
typedef struct __res_state *res_state;
+/* Retrieve a local copy of the stats for the given netid. The buffer must have space for
+ * MAXNS __resolver_stats. Returns the revision id of the resolvers used.
+ */
+__LIBC_HIDDEN__
+extern int
+_resolv_cache_get_resolver_stats( unsigned netid, struct __res_params* params,
+ struct __res_stats stats[MAXNS]);
+
+/* Add a sample to the shared struct for the given netid and server, provided that the
+ * revision_id of the stored servers has not changed.
+ */
+__LIBC_HIDDEN__
+extern void
+_resolv_cache_add_resolver_stats_sample( unsigned netid, int revision_id, int ns,
+ const struct __res_sample* sample, int max_samples);
+
+/* End of stats related definitions */
+
union res_sockaddr_union {
struct sockaddr_in sin;
#ifdef IN6ADDR_ANY_INIT
diff --git a/libc/dns/include/resolv_stats.h b/libc/dns/include/resolv_stats.h
new file mode 100644
index 0000000..2aab958
--- /dev/null
+++ b/libc/dns/include/resolv_stats.h
@@ -0,0 +1,73 @@
+/*
+ * Copyright (C) 2016 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef _RES_STATS_H
+#define _RES_STATS_H
+
+#include <stdbool.h>
+#include <stdint.h>
+#include <time.h>
+
+#include "resolv_params.h"
+
+#define RCODE_INTERNAL_ERROR 254
+#define RCODE_TIMEOUT 255
+
+/*
+ * Resolver reachability statistics and run-time parameters.
+ */
+
+struct __res_sample {
+ time_t at; // time in s at which the sample was recorded
+ uint16_t rtt; // round-trip time in ms
+ uint8_t rcode; // the DNS rcode or RCODE_XXX defined above
+};
+
+struct __res_stats {
+ // Stats of the last <sample_count> queries.
+ struct __res_sample samples[MAXNSSAMPLES];
+ // The number of samples stored.
+ uint8_t sample_count;
+ // The next sample to modify.
+ uint8_t sample_next;
+};
+
+/* Calculate the round-trip-time from start time t0 and end time t1. */
+int
+_res_stats_calculate_rtt(const struct timespec* t1, const struct timespec* t0);
+
+/* Initialize a sample for calculating server reachability statistics. */
+extern void
+_res_stats_set_sample(struct __res_sample* sample, time_t now, int rcode, int rtt);
+
+/* Aggregates the reachability statistics for the given server based on on the stored samples. */
+extern void
+_res_stats_aggregate(struct __res_stats* stats, int* successes, int* errors, int* timeouts,
+ int* internal_errors, int* rtt_avg, time_t* last_sample_time);
+
+/* Returns true if the server is considered unusable, i.e. if the success rate is not lower than the
+ * threshold for the stored stored samples. If not enough samples are stored, the server is
+ * considered usable.
+ */
+extern bool
+_res_stats_usable_server(const struct __res_params* params, struct __res_stats* stats);
+
+/* Returns an array of bools indicating which servers are considered good */
+extern void
+_res_stats_get_usable_servers(const struct __res_params* params, struct __res_stats stats[MAXNS],
+ int nscount, bool valid_servers[MAXNS]);
+
+#endif // _RES_STATS_H
diff --git a/libc/dns/resolv/res_cache.c b/libc/dns/resolv/res_cache.c
index ae8debb..526c91b 100644
--- a/libc/dns/resolv/res_cache.c
+++ b/libc/dns/resolv/res_cache.c
@@ -1234,6 +1234,9 @@
struct resolv_cache_info* next;
char* nameservers[MAXNS +1];
struct addrinfo* nsaddrinfo[MAXNS + 1];
+ int revision_id; // # times the nameservers have been replaced
+ struct __res_params params;
+ struct __res_stats nsstats[MAXNS];
char defdname[256];
int dnsrch_offset[MAXDNSRCH+1]; // offsets into defdname
};
@@ -1361,6 +1364,8 @@
pthread_mutex_unlock(&_res_cache_list_lock);
}
+static struct resolv_cache_info* _find_cache_info_locked(unsigned netid);
+
static void
_cache_flush_locked( Cache* cache )
{
@@ -1801,6 +1806,8 @@
* currently attached to the provided cache_info */
static int _resolv_is_nameservers_equal_locked(struct resolv_cache_info* cache_info,
const char** servers, int numservers);
+/* clears the stats samples contained withing the given cache_info */
+static void _res_cache_clear_stats_locked(struct resolv_cache_info* cache_info);
static void
_res_cache_init(void)
@@ -1854,6 +1861,10 @@
if (cache) {
_cache_flush_locked(cache);
}
+
+ // Also clear the NS statistics.
+ struct resolv_cache_info* cache_info = _find_cache_info_locked(netid);
+ _res_cache_clear_stats_locked(cache_info);
}
void _resolv_delete_cache_for_net(unsigned netid)
@@ -1928,8 +1939,16 @@
}
void
+_resolv_set_default_params(struct __res_params* params) {
+ params->sample_validity = NSSAMPLE_VALIDITY;
+ params->success_threshold = SUCCESS_THRESHOLD;
+ params->min_samples = 0;
+ params->max_samples = 0;
+}
+
+void
_resolv_set_nameservers_for_net(unsigned netid, const char** servers, int numservers,
- const char *domains)
+ const char *domains, const struct __res_params* params)
{
int i, rt, index;
struct addrinfo hints;
@@ -1945,54 +1964,76 @@
struct resolv_cache_info* cache_info = _find_cache_info_locked(netid);
- if (cache_info != NULL &&
- !_resolv_is_nameservers_equal_locked(cache_info, servers, numservers)) {
- // free current before adding new
- _free_nameservers_locked(cache_info);
-
- memset(&hints, 0, sizeof(hints));
- hints.ai_family = PF_UNSPEC;
- hints.ai_socktype = SOCK_DGRAM; /*dummy*/
- hints.ai_flags = AI_NUMERICHOST;
- snprintf(sbuf, sizeof(sbuf), "%u", NAMESERVER_PORT);
-
- index = 0;
- for (i = 0; i < numservers && i < MAXNS; i++) {
- rt = getaddrinfo(servers[i], sbuf, &hints, &cache_info->nsaddrinfo[index]);
- if (rt == 0) {
- cache_info->nameservers[index] = strdup(servers[i]);
- index++;
- XLOG("%s: netid = %u, addr = %s\n", __FUNCTION__, netid, servers[i]);
- } else {
- cache_info->nsaddrinfo[index] = NULL;
- }
+ if (cache_info != NULL) {
+ uint8_t old_max_samples = cache_info->params.max_samples;
+ if (params != NULL) {
+ cache_info->params = *params;
+ } else {
+ _resolv_set_default_params(&cache_info->params);
}
- // code moved from res_init.c, load_domain_search_list
- strlcpy(cache_info->defdname, domains, sizeof(cache_info->defdname));
- if ((cp = strchr(cache_info->defdname, '\n')) != NULL)
- *cp = '\0';
- cp = cache_info->defdname;
- offset = cache_info->dnsrch_offset;
- while (offset < cache_info->dnsrch_offset + MAXDNSRCH) {
- while (*cp == ' ' || *cp == '\t') /* skip leading white space */
- cp++;
- if (*cp == '\0') /* stop if nothing more to do */
- break;
- *offset++ = cp - cache_info->defdname; /* record this search domain */
- while (*cp) { /* zero-terminate it */
- if (*cp == ' '|| *cp == '\t') {
- *cp++ = '\0';
- break;
+ if (!_resolv_is_nameservers_equal_locked(cache_info, servers, numservers)) {
+ // free current before adding new
+ _free_nameservers_locked(cache_info);
+
+ memset(&hints, 0, sizeof(hints));
+ hints.ai_family = PF_UNSPEC;
+ hints.ai_socktype = SOCK_DGRAM; /*dummy*/
+ hints.ai_flags = AI_NUMERICHOST;
+ snprintf(sbuf, sizeof(sbuf), "%u", NAMESERVER_PORT);
+
+ index = 0;
+ for (i = 0; i < numservers && i < MAXNS; i++) {
+ rt = getaddrinfo(servers[i], sbuf, &hints, &cache_info->nsaddrinfo[index]);
+ if (rt == 0) {
+ cache_info->nameservers[index] = strdup(servers[i]);
+ index++;
+ XLOG("%s: netid = %u, addr = %s\n", __FUNCTION__, netid, servers[i]);
+ } else {
+ cache_info->nsaddrinfo[index] = NULL;
}
- cp++;
}
- }
- *offset = -1; /* cache_info->dnsrch_offset has MAXDNSRCH+1 items */
- // flush cache since new settings
- _flush_cache_for_net_locked(netid);
+ // code moved from res_init.c, load_domain_search_list
+ strlcpy(cache_info->defdname, domains, sizeof(cache_info->defdname));
+ if ((cp = strchr(cache_info->defdname, '\n')) != NULL)
+ *cp = '\0';
+ cp = cache_info->defdname;
+ offset = cache_info->dnsrch_offset;
+ while (offset < cache_info->dnsrch_offset + MAXDNSRCH) {
+ while (*cp == ' ' || *cp == '\t') /* skip leading white space */
+ cp++;
+ if (*cp == '\0') /* stop if nothing more to do */
+ break;
+ *offset++ = cp - cache_info->defdname; /* record this search domain */
+ while (*cp) { /* zero-terminate it */
+ if (*cp == ' '|| *cp == '\t') {
+ *cp++ = '\0';
+ break;
+ }
+ cp++;
+ }
+ }
+ *offset = -1; /* cache_info->dnsrch_offset has MAXDNSRCH+1 items */
+
+ // Flush the cache and reset the stats.
+ _flush_cache_for_net_locked(netid);
+
+ // increment the revision id to ensure that sample state is not written back if the
+ // servers change; in theory it would suffice to do so only if the servers or
+ // max_samples actually change, in practice the overhead of checking is higher than the
+ // cost, and overflows are unlikely
+ ++cache_info->revision_id;
+ } else if (cache_info->params.max_samples != old_max_samples) {
+ // If the maximum number of samples changes, the overhead of keeping the most recent
+ // samples around is not considered worth the effort, so they are cleared instead. All
+ // other parameters do not affect shared state: Changing these parameters does not
+ // invalidate the samples, as they only affect aggregation and the conditions under which
+ // servers are considered usable.
+ _res_cache_clear_stats_locked(cache_info);
+ ++cache_info->revision_id;
+ }
}
pthread_mutex_unlock(&_res_cache_list_lock);
@@ -2049,7 +2090,11 @@
freeaddrinfo(cache_info->nsaddrinfo[i]);
cache_info->nsaddrinfo[i] = NULL;
}
+ cache_info->nsstats[i].sample_count =
+ cache_info->nsstats[i].sample_next = 0;
}
+ _res_cache_clear_stats_locked(cache_info);
+ ++cache_info->revision_id;
}
void
@@ -2103,3 +2148,65 @@
}
pthread_mutex_unlock(&_res_cache_list_lock);
}
+
+/* Resolver reachability statistics. */
+
+static void
+_res_cache_add_stats_sample_locked(struct __res_stats* stats, const struct __res_sample* sample,
+ int max_samples) {
+ // Note: This function expects max_samples > 0, otherwise a (harmless) modification of the
+ // allocated but supposedly unused memory for samples[0] will happen
+ XLOG("%s: adding sample to stats, next = %d, count = %d", __FUNCTION__,
+ stats->sample_next, stats->sample_count);
+ stats->samples[stats->sample_next] = *sample;
+ if (stats->sample_count < max_samples) {
+ ++stats->sample_count;
+ }
+ if (++stats->sample_next >= max_samples) {
+ stats->sample_next = 0;
+ }
+}
+
+static void
+_res_cache_clear_stats_locked(struct resolv_cache_info* cache_info) {
+ if (cache_info) {
+ for (int i = 0 ; i < MAXNS ; ++i) {
+ cache_info->nsstats->sample_count = cache_info->nsstats->sample_next = 0;
+ }
+ }
+}
+
+int
+_resolv_cache_get_resolver_stats( unsigned netid, struct __res_params* params,
+ struct __res_stats stats[MAXNS]) {
+
+ int revision_id = -1;
+ pthread_mutex_lock(&_res_cache_list_lock);
+
+ struct resolv_cache_info* info = _find_cache_info_locked(netid);
+ if (info) {
+ memcpy(stats, info->nsstats, sizeof(info->nsstats));
+ *params = info->params;
+ revision_id = info->revision_id;
+ }
+
+ pthread_mutex_unlock(&_res_cache_list_lock);
+ return revision_id;
+}
+
+void
+_resolv_cache_add_resolver_stats_sample( unsigned netid, int revision_id, int ns,
+ const struct __res_sample* sample, int max_samples) {
+ if (max_samples <= 0) return;
+
+ pthread_mutex_lock(&_res_cache_list_lock);
+
+ struct resolv_cache_info* info = _find_cache_info_locked(netid);
+
+ if (info && info->revision_id == revision_id) {
+ _res_cache_add_stats_sample_locked(&info->nsstats[ns], sample, max_samples);
+ }
+
+ pthread_mutex_unlock(&_res_cache_list_lock);
+}
+
diff --git a/libc/dns/resolv/res_send.c b/libc/dns/resolv/res_send.c
index 3458e48..bfc6e1a 100644
--- a/libc/dns/resolv/res_send.c
+++ b/libc/dns/resolv/res_send.c
@@ -81,9 +81,6 @@
#endif
#endif /* LIBC_SCCS and not lint */
-/* set to 1 to use our small/simple/limited DNS cache */
-#define USE_RESOLV_CACHE 1
-
/*
* Send query to name server and wait for reply.
*/
@@ -116,9 +113,7 @@
#include <isc/eventlib.h>
-#if USE_RESOLV_CACHE
-# include <resolv_cache.h>
-#endif
+#include <resolv_cache.h>
#include "private/libc_logging.h"
@@ -133,6 +128,7 @@
#endif
#include "res_debug.h"
#include "res_private.h"
+#include "resolv_stats.h"
#define EXT(res) ((res)->_u._ext)
#define DBG 0
@@ -144,10 +140,12 @@
static int get_salen __P((const struct sockaddr *));
static struct sockaddr * get_nsaddr __P((res_state, size_t));
static int send_vc(res_state, const u_char *, int,
- u_char *, int, int *, int);
+ u_char *, int, int *, int,
+ time_t *, int *, int *);
static int send_dg(res_state, const u_char *, int,
u_char *, int, int *, int,
- int *, int *);
+ int *, int *,
+ time_t *, int *, int *);
static void Aerror(const res_state, FILE *, const char *, int,
const struct sockaddr *, int);
static void Perror(const res_state, FILE *, const char *, int);
@@ -359,23 +357,13 @@
return (1);
}
-
int
res_nsend(res_state statp,
const u_char *buf, int buflen, u_char *ans, int anssiz)
{
int gotsomewhere, terrno, try, v_circuit, resplen, ns, n;
char abuf[NI_MAXHOST];
-#if USE_RESOLV_CACHE
- ResolvCacheStatus cache_status = RESOLV_CACHE_UNSUPPORTED;
-#endif
-
-#if !USE_RESOLV_CACHE
- if (statp->nscount == 0) {
- errno = ESRCH;
- return (-1);
- }
-#endif
+ ResolvCacheStatus cache_status = RESOLV_CACHE_UNSUPPORTED;
if (anssiz < HFIXEDSZ) {
errno = EINVAL;
@@ -387,7 +375,6 @@
gotsomewhere = 0;
terrno = ETIMEDOUT;
-#if USE_RESOLV_CACHE
int anslen = 0;
cache_status = _resolv_cache_lookup(
statp->netid, buf, buflen,
@@ -400,7 +387,6 @@
// data so the normal resolve path can do its thing
_resolv_populate_res_for_net(statp);
}
-
if (statp->nscount == 0) {
// We have no nameservers configured, so there's no point trying.
// Tell the cache the query failed, or any retries and anyone else asking the same
@@ -409,7 +395,6 @@
errno = ESRCH;
return (-1);
}
-#endif
/*
* If the ns_addr_list in the resolver context has changed, then
@@ -420,9 +405,9 @@
struct sockaddr_storage peer;
socklen_t peerlen;
- if (EXT(statp).nscount != statp->nscount)
+ if (EXT(statp).nscount != statp->nscount) {
needclose++;
- else
+ } else {
for (ns = 0; ns < statp->nscount; ns++) {
if (statp->nsaddr_list[ns].sin_family &&
!sock_eq((struct sockaddr *)(void *)&statp->nsaddr_list[ns],
@@ -445,6 +430,7 @@
break;
}
}
+ }
if (needclose) {
res_nclose(statp);
EXT(statp).nscount = 0;
@@ -485,7 +471,7 @@
nstime = EXT(statp).nstimes[0];
for (ns = 0; ns < lastns; ns++) {
if (EXT(statp).ext != NULL)
- EXT(statp).ext->nsaddrs[ns] =
+ EXT(statp).ext->nsaddrs[ns] =
EXT(statp).ext->nsaddrs[ns + 1];
statp->nsaddr_list[ns] = statp->nsaddr_list[ns + 1];
EXT(statp).nssocks[ns] = EXT(statp).nssocks[ns + 1];
@@ -502,13 +488,24 @@
* Send request, RETRY times, or until successful.
*/
for (try = 0; try < statp->retry; try++) {
+ struct __res_stats stats[MAXNS];
+ struct __res_params params;
+ int revision_id = _resolv_cache_get_resolver_stats(statp->netid, ¶ms, stats);
+ bool usable_servers[MAXNS];
+ _res_stats_get_usable_servers(¶ms, stats, statp->nscount, usable_servers);
+
for (ns = 0; ns < statp->nscount; ns++) {
+ if (!usable_servers[ns]) continue;
struct sockaddr *nsap;
int nsaplen;
+ time_t now = 0;
+ int rcode = RCODE_INTERNAL_ERROR;
+ int delay = 0;
nsap = get_nsaddr(statp, (size_t)ns);
nsaplen = get_salen(nsap);
statp->_flags &= ~RES_F_LASTMASK;
statp->_flags |= (ns << RES_F_LASTSHIFT);
+
same_ns:
if (statp->qhook) {
int done = 0, loops = 0;
@@ -552,7 +549,12 @@
try = statp->retry;
n = send_vc(statp, buf, buflen, ans, anssiz, &terrno,
- ns);
+ ns, &now, &rcode, &delay);
+
+ struct __res_sample sample;
+ _res_stats_set_sample(&sample, now, rcode, delay);
+ _resolv_cache_add_resolver_stats_sample(statp->netid, revision_id, ns,
+ &sample, params.max_samples);
if (DBG) {
__libc_format_log(ANDROID_LOG_DEBUG, "libc",
@@ -571,7 +573,13 @@
}
n = send_dg(statp, buf, buflen, ans, anssiz, &terrno,
- ns, &v_circuit, &gotsomewhere);
+ ns, &v_circuit, &gotsomewhere, &now, &rcode, &delay);
+
+ struct __res_sample sample;
+ _res_stats_set_sample(&sample, now, rcode, delay);
+ _resolv_cache_add_resolver_stats_sample(statp->netid, revision_id, ns,
+ &sample, params.max_samples);
+
if (DBG) {
__libc_format_log(ANDROID_LOG_DEBUG, "libc", "used send_dg %d\n",n);
}
@@ -582,7 +590,7 @@
goto next_ns;
if (DBG) {
__libc_format_log(ANDROID_LOG_DEBUG, "libc", "time=%ld\n",
- time(NULL));
+ time(NULL));
}
if (v_circuit)
goto same_ns;
@@ -599,12 +607,10 @@
(stdout, "%s", ""),
ans, (resplen > anssiz) ? anssiz : resplen);
-#if USE_RESOLV_CACHE
- if (cache_status == RESOLV_CACHE_NOTFOUND) {
- _resolv_cache_add(statp->netid, buf, buflen,
- ans, resplen);
- }
-#endif
+ if (cache_status == RESOLV_CACHE_NOTFOUND) {
+ _resolv_cache_add(statp->netid, buf, buflen,
+ ans, resplen);
+ }
/*
* If we have temporarily opened a virtual circuit,
* or if we haven't been asked to keep a socket open,
@@ -656,15 +662,12 @@
} else
errno = terrno;
-#if USE_RESOLV_CACHE
- _resolv_cache_query_failed(statp->netid, buf, buflen);
-#endif
+ _resolv_cache_query_failed(statp->netid, buf, buflen);
return (-1);
fail:
-#if USE_RESOLV_CACHE
+
_resolv_cache_query_failed(statp->netid, buf, buflen);
-#endif
res_nclose(statp);
return (-1);
}
@@ -735,8 +738,11 @@
static int
send_vc(res_state statp,
const u_char *buf, int buflen, u_char *ans, int anssiz,
- int *terrno, int ns)
+ int *terrno, int ns, time_t* at, int* rcode, int* delay)
{
+ *at = 0;
+ *rcode = RCODE_INTERNAL_ERROR;
+ *delay = 0;
const HEADER *hp = (const HEADER *)(const void *)buf;
HEADER *anhp = (HEADER *)(void *)ans;
struct sockaddr *nsap;
@@ -758,6 +764,8 @@
same_ns:
truncating = 0;
+ struct timespec now = evNowTime();
+
/* Are we still talking to whom we want to talk to? */
if (statp->_vcsock >= 0 && (statp->_flags & RES_F_VC) != 0) {
struct sockaddr_storage peer;
@@ -800,7 +808,7 @@
}
if (statp->_mark != MARK_UNSET) {
if (setsockopt(statp->_vcsock, SOL_SOCKET,
- SO_MARK, &statp->_mark, sizeof(statp->_mark)) < 0) {
+ SO_MARK, &statp->_mark, sizeof(statp->_mark)) < 0) {
*terrno = errno;
Perror(statp, stderr, "setsockopt", errno);
return -1;
@@ -820,6 +828,15 @@
Aerror(statp, stderr, "connect/vc", errno, nsap,
nsaplen);
res_nclose(statp);
+ /*
+ * The way connect_with_timeout() is implemented prevents us from reliably
+ * determining whether this was really a timeout or e.g. ECONNREFUSED. Since
+ * currently both cases are handled in the same way, there is no need to
+ * change this (yet). If we ever need to reliably distinguish between these
+ * cases, both connect_with_timeout() and retrying_select() need to be
+ * modified, though.
+ */
+ *rcode = RCODE_TIMEOUT;
return (0);
}
statp->_flags |= RES_F_VC;
@@ -900,6 +917,10 @@
res_nclose(statp);
return (0);
}
+
+ struct timespec done = evNowTime();
+ *at = done.tv_sec;
+
if (truncating) {
/*
* Flush rest of answer so connection stays in synch.
@@ -936,6 +957,10 @@
* All is well, or the error is fatal. Signal that the
* next nameserver ought not be tried.
*/
+ if (resplen > 0) {
+ *delay = _res_stats_calculate_rtt(&done, &now);
+ *rcode = anhp->rcode;
+ }
return (resplen);
}
@@ -952,8 +977,8 @@
res = __connect(sock, nsap, salen);
if (res < 0 && errno != EINPROGRESS) {
- res = -1;
- goto done;
+ res = -1;
+ goto done;
}
if (res != 0) {
now = evNowTime();
@@ -965,7 +990,7 @@
res = retrying_select(sock, &rset, &wset, &finish);
if (res <= 0) {
- res = -1;
+ res = -1;
}
}
done:
@@ -987,7 +1012,7 @@
retry:
if (DBG) {
- __libc_format_log(ANDROID_LOG_DEBUG, "libc", " %d retying_select\n", sock);
+ __libc_format_log(ANDROID_LOG_DEBUG, "libc", " %d retrying_select\n", sock);
}
now = evNowTime();
@@ -1042,17 +1067,20 @@
return n;
}
-
static int
send_dg(res_state statp,
const u_char *buf, int buflen, u_char *ans, int anssiz,
- int *terrno, int ns, int *v_circuit, int *gotsomewhere)
+ int *terrno, int ns, int *v_circuit, int *gotsomewhere,
+ time_t *at, int *rcode, int* delay)
{
+ *at = 0;
+ *rcode = RCODE_INTERNAL_ERROR;
+ *delay = 0;
const HEADER *hp = (const HEADER *)(const void *)buf;
HEADER *anhp = (HEADER *)(void *)ans;
const struct sockaddr *nsap;
int nsaplen;
- struct timespec now, timeout, finish;
+ struct timespec now, timeout, finish, done;
fd_set dsmask;
struct sockaddr_storage from;
socklen_t fromlen;
@@ -1145,6 +1173,7 @@
n = retrying_select(s, &dsmask, NULL, &finish);
if (n == 0) {
+ *rcode = RCODE_TIMEOUT;
Dprint(statp->options & RES_DEBUG, (stdout, ";; timeout\n"));
*gotsomewhere = 1;
return (0);
@@ -1230,6 +1259,9 @@
ans, (resplen > anssiz) ? anssiz : resplen);
goto retry;;
}
+ done = evNowTime();
+ *at = done.tv_sec;
+ *delay = _res_stats_calculate_rtt(&done, &now);
if (anhp->rcode == SERVFAIL ||
anhp->rcode == NOTIMP ||
anhp->rcode == REFUSED) {
@@ -1238,8 +1270,10 @@
ans, (resplen > anssiz) ? anssiz : resplen);
res_nclose(statp);
/* don't retry if called from dig */
- if (!statp->pfcode)
+ if (!statp->pfcode) {
+ *rcode = anhp->rcode;
return (0);
+ }
}
if (!(statp->options & RES_IGNTC) && anhp->tc) {
/*
@@ -1256,6 +1290,9 @@
* All is well, or the error is fatal. Signal that the
* next nameserver ought not be tried.
*/
+ if (resplen > 0) {
+ *rcode = anhp->rcode;
+ }
return (resplen);
}
diff --git a/libc/dns/resolv/res_stats.c b/libc/dns/resolv/res_stats.c
new file mode 100644
index 0000000..a3d6f36
--- /dev/null
+++ b/libc/dns/resolv/res_stats.c
@@ -0,0 +1,184 @@
+/*
+ * Copyright (C) 2016 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <stdbool.h>
+#include <arpa/nameser.h>
+#include <string.h>
+
+#include "resolv_stats.h"
+#include "private/libc_logging.h"
+#include "isc/eventlib.h"
+
+#define DBG 1
+
+/* Calculate the round-trip-time from start time t0 and end time t1. */
+int
+_res_stats_calculate_rtt(const struct timespec* t1, const struct timespec* t0) {
+ // Divide ns by one million to get ms, multiply s by thousand to get ms (obvious)
+ long ms0 = t0->tv_sec * 1000 + t0->tv_nsec / 1000000;
+ long ms1 = t1->tv_sec * 1000 + t1->tv_nsec / 1000000;
+ return (int) (ms1 - ms0);
+}
+
+/* Create a sample for calculating server reachability statistics. */
+void
+_res_stats_set_sample(struct __res_sample* sample, time_t now, int rcode, int rtt)
+{
+ if (DBG) {
+ __libc_format_log(ANDROID_LOG_INFO, "libc", "rcode = %d, sec = %d", rcode, rtt);
+ }
+ sample->at = now;
+ sample->rcode = rcode;
+ sample->rtt = rtt;
+}
+
+/* Clears all stored samples for the given server. */
+void
+_res_stats_clear_samples(struct __res_stats* stats)
+{
+ stats->sample_count = stats->sample_next = 0;
+}
+
+/* Aggregates the reachability statistics for the given server based on on the stored samples. */
+void
+_res_stats_aggregate(struct __res_stats* stats, int* successes, int* errors, int* timeouts,
+ int* internal_errors, int* rtt_avg, time_t* last_sample_time)
+{
+ int s = 0; // successes
+ int e = 0; // errors
+ int t = 0; // timouts
+ int ie = 0; // internal errors
+ long rtt_sum = 0;
+ time_t last = 0;
+ int rtt_count = 0;
+ for (int i = 0 ; i < stats->sample_count ; ++i) {
+ // Treat everything as an error that the code in send_dg() already considers a
+ // rejection by the server, i.e. SERVFAIL, NOTIMP and REFUSED. Assume that NXDOMAIN
+ // and NOTAUTH can actually occur for user queries. NOERROR with empty answer section
+ // is not treated as an error here either. FORMERR seems to sometimes be returned by
+ // some versions of BIND in response to DNSSEC or EDNS0. Whether to treat such responses
+ // as an indication of a broken server is unclear, though. For now treat such responses,
+ // as well as unknown codes as errors.
+ switch (stats->samples[i].rcode) {
+ case NOERROR:
+ case NOTAUTH:
+ case NXDOMAIN:
+ ++s;
+ rtt_sum += stats->samples[i].rtt;
+ ++rtt_count;
+ break;
+ case RCODE_TIMEOUT:
+ ++t;
+ break;
+ case RCODE_INTERNAL_ERROR:
+ ++ie;
+ break;
+ case SERVFAIL:
+ case NOTIMP:
+ case REFUSED:
+ default:
+ ++e;
+ break;
+ }
+ }
+ *successes = s;
+ *errors = e;
+ *timeouts = t;
+ *internal_errors = ie;
+ /* If there was at least one successful sample, calculate average RTT. */
+ if (rtt_count) {
+ *rtt_avg = rtt_sum / rtt_count;
+ } else {
+ *rtt_avg = -1;
+ }
+ /* If we had at least one sample, populate last sample time. */
+ if (stats->sample_count > 0) {
+ if (stats->sample_next > 0) {
+ last = stats->samples[stats->sample_next - 1].at;
+ } else {
+ last = stats->samples[stats->sample_count - 1].at;
+ }
+ }
+ *last_sample_time = last;
+}
+
+bool
+_res_stats_usable_server(const struct __res_params* params, struct __res_stats* stats) {
+ int successes = -1;
+ int errors = -1;
+ int timeouts = -1;
+ int internal_errors = -1;
+ int rtt_avg = -1;
+ time_t last_sample_time = 0;
+ _res_stats_aggregate(stats, &successes, &errors, &timeouts, &internal_errors, &rtt_avg,
+ &last_sample_time);
+ if (successes >= 0 && errors >= 0 && timeouts >= 0) {
+ int total = successes + errors + timeouts;
+ if (DBG) {
+ __libc_format_log(ANDROID_LOG_DEBUG, "libc", "NS stats: S %d + E %d + T %d + I %d "
+ "= %d, rtt = %d, min_samples = %d\n", successes, errors, timeouts, internal_errors,
+ total, rtt_avg, params->min_samples);
+ }
+ if (total >= params->min_samples && (errors > 0 || timeouts > 0)) {
+ int success_rate = successes * 100 / total;
+ if (DBG) {
+ __libc_format_log(ANDROID_LOG_DEBUG, "libc", "success rate %d%%\n", success_rate);
+ }
+ if (success_rate < params->success_threshold) {
+ // evNowTime() is used here instead of time() to stay consistent with the rest of
+ // the code base
+ time_t now = evNowTime().tv_sec;
+ if (now - last_sample_time > params->sample_validity) {
+ // Note: It might be worth considering to expire old servers after their expiry
+ // date has been reached, however the code for returning the ring buffer to its
+ // previous non-circular state would induce additional complexity.
+ if (DBG) {
+ __libc_format_log(ANDROID_LOG_INFO, "libc",
+ "samples stale, retrying server\n");
+ }
+ _res_stats_clear_samples(stats);
+ } else {
+ if (DBG) {
+ __libc_format_log(ANDROID_LOG_INFO, "libc",
+ "too many resolution errors, ignoring server\n");
+ }
+ return 0;
+ }
+ }
+ }
+ }
+ return 1;
+}
+
+void
+_res_stats_get_usable_servers(const struct __res_params* params, struct __res_stats stats[MAXNS],
+ int nscount, bool usable_servers[MAXNS]) {
+ unsigned usable_servers_found = 0;
+ for (int ns = 0; ns < nscount; ns++) {
+ bool usable = _res_stats_usable_server(params, &stats[ns]);
+ if (usable) {
+ ++usable_servers_found;
+ }
+ usable_servers[ns] = usable;
+ }
+ // If there are no usable servers, consider all of them usable.
+ // TODO: Explore other possibilities, such as enabling only the best N servers, etc.
+ if (usable_servers_found == 0) {
+ for (int ns = 0; ns < nscount; ns++) {
+ usable_servers[ns] = true;
+ }
+ }
+}