
Commit c3afad9

Fix logging performance issues in Elasticsearch v2. rsyslog improvements
After upgrading to Elasticsearch v2, some of our logging tests were consistently failing if they happened to follow a number of other tests. The issue was that if a test previously ran that made a bunch of requests, then Elasticsearch wasn't indexing the data quickly enough, causing the rsyslog queue to get backed up. This resulted in the requests from the logging tests not getting indexed into Elasticsearch within the expected time.

There were 2 main performance issues with Elasticsearch v2 that led to this:

1. Elasticsearch syncs the data to disk on every request in version 2, which hurts indexing performance. We've switched things back to v1's mode of performing asynchronous syncs periodically (so we're trading some safety for speed, but that seems okay for this kind of log data).

2. Updating index mappings in Elasticsearch version 2 is more costly. Our mapping is mostly static unless you enable logging of the "request_query" field, which stores all the request query params as a nested object. We have some tests that generate a bunch of unique, random query parameters (mainly for cache busting), but this leads to a deluge of mapping updates, since each new query param seen means the mapping needs to be updated (a brief illustration follows below).

   We recently disabled gathering this "request_query" field by default, but it was still enabled in our test suite by default, since we had some existing tests that relied on this functionality. So to solve the performance issues, we've shifted our test suite to disable gathering "request_query" too. This eliminates all the mapping updates during tests. The ability to enable "request_query" collection still exists, and the existing tests for this functionality have been retained (just in a more isolated fashion that won't impact other tests). However, given the potential performance issues of enabling this, it might be worth getting rid of this functionality altogether.

While debugging the performance issues, we've also made a couple of tweaks and improvements to our rsyslog setup:

- The queue.size wasn't configured, so the memory portion of the queue was capped at 1,000 by default. This meant the configured highwater and lowwater sizes weren't actually being used. Set a higher queue.size to resolve this.
- Enable the impstats plugin to output rsyslog queue stats every minute. This seems generally helpful to have in place to be able to see what rsyslog's up to and whether anything is becoming congested with logging.
- More comments to explain some of the more cryptic rsyslog configuration settings.
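To make the mapping-update problem concrete, here is a minimal sketch of the kind of logged document involved. The parameter name is hypothetical (not taken from the actual tests): because "request_query" stores query params as an object, every previously unseen key becomes a new field in the index mapping, so randomized cache-busting parameters trigger a mapping update on nearly every request.

  {
    "request_url": "/api/example.json?unique_param_8f3a1c=1",
    "request_query": {
      "unique_param_8f3a1c": "1"
    }
  }

With "request_query" collection disabled, only the fixed set of fields in config/elasticsearch_templates.json ever gets mapped, so the test suite no longer generates mapping updates while indexing.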
1 parent 0c981a8 commit c3afad9

9 files changed: +434 -158 lines changed

build/cmake/rsyslog.cmake

Lines changed: 1 addition & 0 deletions
@@ -91,6 +91,7 @@ list(APPEND RSYSLOG_CONFIGURE_CMD --prefix=${INSTALL_PREFIX_EMBEDDED})
 list(APPEND RSYSLOG_CONFIGURE_CMD --enable-liblogging-stdlog)
 list(APPEND RSYSLOG_CONFIGURE_CMD --disable-libgcrypt)
 list(APPEND RSYSLOG_CONFIGURE_CMD --enable-imptcp)
+list(APPEND RSYSLOG_CONFIGURE_CMD --enable-impstats)
 list(APPEND RSYSLOG_CONFIGURE_CMD --enable-mmjsonparse)
 list(APPEND RSYSLOG_CONFIGURE_CMD --enable-mmutf8fix)
 list(APPEND RSYSLOG_CONFIGURE_CMD --enable-elasticsearch)

config/default.yml

Lines changed: 7 additions & 0 deletions
@@ -162,6 +162,13 @@ elasticsearch:
     breaker:
       fielddata:
         limit: 60%
+    index:
+      translog:
+        # Sync the data to disk asynchronously on a fixed interval, rather than
+        # for every request. This significantly helps indexing throughput (at
+        # the risk of losing a few seconds of data if things crash).
+        durability: async
+        sync_interval: 10s
 analytics:
   adapter: elasticsearch
   timezone: UTC
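The "durability: async" and "sync_interval: 10s" values above correspond to Elasticsearch's index.translog.* index settings. As a hedged sketch only (this commit applies them through api-umbrella's config file rather than through a template, and the template pattern shown here is illustrative), the same behavior expressed inside an index template's settings block would look roughly like:

  {
    "template": "api-umbrella-logs-*",
    "settings": {
      "index": {
        "translog": {
          "durability": "async",
          "sync_interval": "10s"
        }
      }
    }
  }

Either way, the trade-off is the one the comment describes: fsyncs happen on a 10-second timer instead of per request, so up to that window of log data could be lost on a hard crash.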

config/elasticsearch_templates.json

Lines changed: 114 additions & 18 deletions
@@ -81,66 +81,162 @@
           "type": "string",
           "index": "not_analyzed"
         },
-        "user_id": {
+        "backend_response_time": {
+          "type": "integer"
+        },
+        "gatekeeper_denied_code": {
           "type": "string",
-          "index": "not_analyzed"
+          "analyzer": "keyword_lowercase"
+        },
+        "internal_gatekeeper_time": {
+          "type": "float"
+        },
+        "internal_response_time": {
+          "type": "float"
+        },
+        "proxy_overhead": {
+          "type": "integer"
+        },
+        "request_accept": {
+          "type": "string",
+          "analyzer": "keyword_lowercase"
+        },
+        "request_accept_encoding": {
+          "type": "string",
+          "analyzer": "keyword_lowercase"
         },
         "request_at": {
           "type": "date"
         },
+        "request_basic_auth_username": {
+          "type": "string",
+          "analyzer": "keyword_lowercase"
+        },
+        "request_connection": {
+          "type": "string",
+          "analyzer": "keyword_lowercase"
+        },
+        "request_content_type": {
+          "type": "string",
+          "analyzer": "keyword_lowercase"
+        },
+        "request_hierarchy": {
+          "type": "string",
+          "analyzer": "keyword_lowercase"
+        },
+        "request_host": {
+          "type": "string",
+          "analyzer": "keyword_lowercase"
+        },
         "request_ip": {
           "type": "string",
           "index": "not_analyzed"
         },
+        "request_ip_city": {
+          "type": "string",
+          "index": "not_analyzed"
+        },
         "request_ip_country": {
           "type": "string",
           "index": "not_analyzed"
         },
+        "request_ip_location": {
+          "type": "geo_point",
+          "lat_lon": true
+        },
         "request_ip_region": {
           "type": "string",
           "index": "not_analyzed"
         },
-        "request_ip_city": {
+        "request_method": {
           "type": "string",
-          "index": "not_analyzed"
+          "analyzer": "keyword_lowercase"
+        },
+        "request_origin": {
+          "type": "string",
+          "analyzer": "keyword_lowercase"
+        },
+        "request_path": {
+          "type": "string",
+          "analyzer": "keyword_lowercase"
         },
         "request_path_hierarchy": {
           "type": "string",
           "analyzer": "path_hierarchy_lowercase"
         },
+        "request_referer": {
+          "type": "string",
+          "analyzer": "keyword_lowercase"
+        },
+        "request_scheme": {
+          "type": "string",
+          "analyzer": "keyword_lowercase"
+        },
         "request_size": {
           "type": "integer"
         },
-        "response_status": {
-          "type": "short"
+        "request_url": {
+          "type": "string",
+          "analyzer": "keyword_lowercase"
+        },
+        "request_user_agent": {
+          "type": "string",
+          "analyzer": "keyword_lowercase"
+        },
+        "request_user_agent_family": {
+          "type": "string",
+          "analyzer": "keyword_lowercase"
+        },
+        "request_user_agent_type": {
+          "type": "string",
+          "analyzer": "keyword_lowercase"
         },
         "response_age": {
           "type": "integer"
         },
+        "response_cache": {
+          "type": "string",
+          "analyzer": "keyword_lowercase"
+        },
+        "response_content_encoding": {
+          "type": "string",
+          "analyzer": "keyword_lowercase"
+        },
         "response_content_length": {
           "type": "integer"
         },
+        "response_content_type": {
+          "type": "string",
+          "analyzer": "keyword_lowercase"
+        },
+        "response_server": {
+          "type": "string",
+          "analyzer": "keyword_lowercase"
+        },
         "response_size": {
           "type": "integer"
         },
-        "response_time": {
-          "type": "integer"
+        "response_status": {
+          "type": "short"
         },
-        "backend_response_time": {
+        "response_time": {
           "type": "integer"
         },
-        "proxy_overhead": {
-          "type": "integer"
+        "response_transfer_encoding": {
+          "type": "string",
+          "analyzer": "keyword_lowercase"
         },
-        "internal_response_time": {
-          "type": "float"
+        "user_email": {
+          "type": "string",
+          "analyzer": "keyword_lowercase"
         },
-        "internal_gatekeeper_time": {
-          "type": "float"
+        "user_id": {
+          "type": "string",
+          "index": "not_analyzed"
         },
-        "request_ip_location": {
-          "type": "geo_point",
-          "lat_lon": true
+        "user_registration_source": {
+          "type": "string",
+          "analyzer": "keyword_lowercase"
         }
       }
     }
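Most of the new string fields above use a "keyword_lowercase" analyzer, which is defined elsewhere in this template file (its definition isn't part of this hunk). For reference, a typical custom analyzer with that behavior, shown here only as an illustrative sketch rather than the project's exact definition, pairs the keyword tokenizer with a lowercase token filter so the whole value is indexed as a single lowercased term:

  {
    "settings": {
      "analysis": {
        "analyzer": {
          "keyword_lowercase": {
            "type": "custom",
            "tokenizer": "keyword",
            "filter": ["lowercase"]
          }
        }
      }
    }
  }

This gives case-insensitive exact matching (useful for values like hostnames and HTTP header contents) without the overhead of full-text analysis.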

config/test.yml

Lines changed: 0 additions & 1 deletion
@@ -5,7 +5,6 @@ http_port: 9080
 https_port: 9081
 analytics:
   timezone: America/Denver
-  log_request_url_query_params_separately: true
 static_site:
   port: 13013
 nginx:

templates/etc/rsyslog.conf.mustache

Lines changed: 41 additions & 4 deletions
@@ -4,6 +4,16 @@ global(maxMessageSize="32k")
 
 global(workDirectory="{{db_dir}}/rsyslog")
 
+# Output queue statistics periodically so the health of the queue can be
+# inspected.
+module(
+  load="impstats"
+  interval="60"
+  resetCounters="on"
+  log.syslog="off"
+  log.file="{{log_dir}}/rsyslog/stats.log"
+)
+
 # Load Modules
 module(load="imtcp" MaxSessions="500")
 module(load="mmjsonparse")
@@ -160,36 +170,59 @@ template(name="elasticsearch-json-record" type="subtree" subtree="$!usr!es")
 # Output to ElasticSearch.
 # A disk-assisted memory queue is used for buffering.
 local0.info action(
+  name="output-elasticsearch"
   type="omelasticsearch"
   server="{{elasticsearch._first_server.host}}"
   serverport="{{elasticsearch._first_server.port}}"
   searchIndex="elasticsearch-index"
   dynSearchIndex="on"
   searchType="log"
   template="elasticsearch-json-record"
+
   # Enable bulk indexing, so batches of records are sent as a single HTTP
   # request.
   bulkmode="on"
   bulkId="elasticsearch-id"
   dynBulkId="on"
+
+  # Allow bulk indexing of batches *up to* this size.
+  #
+  # Note that as long as Elasticsearch is keeping up and data isn't being
+  # queued by rsyslog, then rsyslog will send data as quickly as it can to
+  # Elasticsearch (so the batch sizes might be much smaller). See
+  # http://www.gossamer-threads.com/lists/rsyslog/users/17550
+  queue.dequeuebatchsize="5000"
+
   # Require indexing by all replica shards.
   asyncrepl="off"
+
   # For the in-memory queue, use a linked-list (so the memory doesn't have to
   # be pre-allocated based on a fixed size).
   queue.type="LinkedList"
+
   # Set a filename, so the queue is disk assisted. This allows for offloading
   # the data from the memory queue to disk if the queue becomes bigger than
   # expected.
   queue.filename="queue-elasticsearch"
+
+  # Set thresholds for when the memory queue is too big and should use the
+  # disk (note the disk queue size is not bounded by the queue.size, that only
+  # applies to the memory portion).
+  queue.size="15000"
+  queue.highwatermark="10000"
+  queue.lowwatermark="2000"
+
   # Persist data to disk on this interval (in seconds). We're okay with some
   # loss in the event of unexpected failures.
   queue.checkpointinterval="10"
+
   # Persist data to disk on graceful shutdowns.
   queue.saveonshutdown="on"
-  # Set thresholds for when the memory queue is too big and should use the
-  # disk.
-  queue.highwatermark="10000"
-  queue.lowwatermark="2000"
+
+  # If Elasticsearch is inaccessible, retry on this interval (in seconds)
+  # indefinitely (so we don't stop logging to Elasticsearch in case it goes
+  # down for a longer period of time).
+  action.resumeInterval="30"
   action.resumeRetryCount="-1"
 )
 {{/analytics._output_elasticsearch?}}
@@ -361,6 +394,7 @@ template(name="sql-json-record" type="subtree" subtree="$!usr!sql")
 # Output to Kafka.
 # A disk-assisted memory queue is used for buffering.
 local0.info action(
+  name="output-kafka"
   type="omkafka"
   broker=[{{kafka._rsyslog_broker}}]
   topic="{{kafka.topic}}"
@@ -370,6 +404,7 @@ local0.info action(
   queue.filename="queue-kafka"
   queue.checkpointinterval="10"
   queue.saveonshutdown="on"
+  queue.size="15000"
   queue.highwatermark="10000"
   queue.lowwatermark="2000"
   action.resumeRetryCount="-1"
@@ -381,13 +416,15 @@ template(name="all-json-record" type="list") {
   property(name="$!raw") constant(value="\n")
 }
 local0.info action(
+  name="output-file"
   type="omfile"
   file="{{log_dir}}/rsyslog/requests.log.gz"
   template="all-json-record"
   queue.type="LinkedList"
   queue.filename="queue-file"
   queue.checkpointinterval="10"
   queue.saveonshutdown="on"
+  queue.size="15000"
   queue.highwatermark="10000"
   queue.lowwatermark="2000"
   zipLevel="3"
