Discussion:
[mongodb-user] Relatively small result set takes quite a while to calculate
Alex Paransky
2015-11-17 19:23:23 UTC
I am executing a query which is taking over 45 seconds to return (other
queries run quite fast, within 2 seconds):

db.getCollection("events-qos-loadstart").count(

{

"vd.ple.vd.acc" : "EY",

"vd.ple.vd.pid" : "313c0296-5469-59f7-7cbe-5b818a2e657c",

"vd.pec" : {"$gt" : 1}

}

)


The collection has a proper index that it appears to be using:


{
    "v" : 1,
    "key" : {
        "vd.ple.vd.acc" : 1,
        "vd.ple.vd.pid" : 1,
        "vd.pec" : 1,
        "vd.ts" : 1
    },
    "name" : "vd.ple.vd.acc_1_vd.ple.vd.pid_1_vd.pec_1_vd.ts_1",
    "ns" : "cdc.events-qos-loadstart"
}


There are quite a few records in this collection (2,780,670); however, the
count above returns 384,848.


What can I check to determine why a simple count is taking over 45 seconds
to execute?


Here is the explain plan for the count (which shows that the index is being used):
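
For reference, this output was produced with something along these lines (a rough sketch using the 3.0 shell's explainable-collection helper; the exact invocation may differ):

// Sketch: executionStats explain for the same count predicate.
db.getCollection("events-qos-loadstart").explain("executionStats").count(
    {
        "vd.ple.vd.acc" : "EY",
        "vd.ple.vd.pid" : "313c0296-5469-59f7-7cbe-5b818a2e657c",
        "vd.pec" : {"$gt" : 1}
    }
)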


{
    "queryPlanner" : {
        "plannerVersion" : 1,
        "namespace" : "cdc.events-qos-loadstart",
        "indexFilterSet" : false,
        "parsedQuery" : {
            "$and" : [
                {
                    "vd.ple.vd.acc" : {
                        "$eq" : "EY"
                    }
                },
                {
                    "vd.ple.vd.pid" : {
                        "$eq" : "313c0296-5469-59f7-7cbe-5b818a2e657c"
                    }
                },
                {
                    "vd.pec" : {
                        "$gt" : 1
                    }
                }
            ]
        },
        "winningPlan" : {
            "stage" : "COUNT",
            "inputStage" : {
                "stage" : "COUNT_SCAN",
                "keyPattern" : {
                    "vd.ple.vd.acc" : 1,
                    "vd.ple.vd.pid" : 1,
                    "vd.pec" : 1,
                    "vd.ts" : 1
                },
                "indexName" : "vd.ple.vd.acc_1_vd.ple.vd.pid_1_vd.pec_1_vd.ts_1",
                "isMultiKey" : false
            }
        },
        "rejectedPlans" : [ ]
    },
    "executionStats" : {
        "executionSuccess" : true,
        "nReturned" : 0,
        "executionTimeMillis" : 176,
        "totalKeysExamined" : 385322,
        "totalDocsExamined" : 0,
        "executionStages" : {
            "stage" : "COUNT",
            "nReturned" : 0,
            "executionTimeMillisEstimate" : 130,
            "works" : 385322,
            "advanced" : 0,
            "needTime" : 385321,
            "needFetch" : 0,
            "saveState" : 3010,
            "restoreState" : 3010,
            "isEOF" : 1,
            "invalidates" : 0,
            "nCounted" : 385320,
            "nSkipped" : 0,
            "inputStage" : {
                "stage" : "COUNT_SCAN",
                "nReturned" : 385320,
                "executionTimeMillisEstimate" : 120,
                "works" : 385321,
                "advanced" : 385320,
                "needTime" : 1,
                "needFetch" : 0,
                "saveState" : 3010,
                "restoreState" : 3010,
                "isEOF" : 1,
                "invalidates" : 0,
                "keysExamined" : 385322,
                "keyPattern" : {
                    "vd.ple.vd.acc" : 1,
                    "vd.ple.vd.pid" : 1,
                    "vd.pec" : 1,
                    "vd.ts" : 1
                },
                "indexName" : "vd.ple.vd.acc_1_vd.ple.vd.pid_1_vd.pec_1_vd.ts_1",
                "isMultiKey" : false
            }
        },
        "allPlansExecution" : [ ]
    },
    "serverInfo" : {
        "host" : "HOST",
        "port" : PORT,
        "version" : "3.0.7",
        "gitVersion" : "6ce7cbe8c6b899552dadd907604559806aa2e9bd"
    },
    "ok" : 1
}
Dwight Merriman
2015-11-17 19:48:39 UTC
I'm a bit confused, as the executionTimeMillis in the explain() output is
fast. Did the explain run quickly?

By the way, are you using the WiredTiger or MMAPv1 storage engine? (Not sure
it matters here, but good to know.)
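
A quick way to check from the shell, if needed (a minimal sketch; serverStatus reports this field as of 3.0):

// Reports the active storage engine, e.g. { "name" : "wiredTiger", ... }
db.serverStatus().storageEngine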
Alex Paransky
2015-11-17 20:47:30 UTC
The execution plan query came back very fast. The count query sits there
for at least 45 to 50 seconds. We are using MongoDB 3.0.7 with WiredTiger.

Here are more specs on this database instance:

{
"db" : "cdc",
"collections" : 32,
"objects" : 409884155,
"avgObjSize" : 3860.0832105378654,
"dataSize" : 1582186944981,
"storageSize" : 153111339008,
"numExtents" : 0,
"indexes" : 131,
"indexSize" : 50301149184,
"ok" : 1
}

Thanks.
-AP_
Alex Paransky
2015-11-17 20:51:03 UTC
Here are the stats on the specific collection:

{
"ns" : "cdc.events-qos-loadstart",
"count" : 2800752,
"size" : 16706527988,
"avgObjSize" : 5965,
"storageSize" : 1500979200,
"capped" : false,
"wiredTiger" : {
"metadata" : {
"formatVersion" : 1
},
"creationString" :
"allocation_size=4KB,app_metadata=(formatVersion=1),block_allocation=best,block_compressor=zlib,cache_resident=0,checkpoint=(WiredTigerCheckpoint.64831=(addr=\"01e3018c3181e49cbf413ee3018c6a81e46d6e131fe3018c7b81e4ca799905808080e459751fc0e469318fc0\",order=64831,time=1447793646,size=1764864000,write_gen=1230720)),checkpoint_lsn=(10558,52273664),checksum=on,collator=,columns=,dictionary=0,format=btree,huffman_key=,huffman_value=,id=71,internal_item_max=0,internal_key_max=0,internal_key_truncate=,internal_page_max=4KB,key_format=q,key_gap=10,leaf_item_max=0,leaf_key_max=0,leaf_page_max=32KB,leaf_value_max=1MB,memory_page_max=10m,os_cache_dirty_max=0,os_cache_max=0,prefix_compression=0,prefix_compression_min=4,split_deepen_min_child=0,split_deepen_per_child=0,split_pct=90,value_format=u,version=(major=1,minor=1)",
"type" : "file",
"uri" : "statistics:table:collection-8--1012075229251210100",
"LSM" : {
"bloom filters in the LSM tree" : 0,
"bloom filter false positives" : 0,
"bloom filter hits" : 0,
"bloom filter misses" : 0,
"bloom filter pages evicted from cache" : 0,
"bloom filter pages read into cache" : 0,
"total size of bloom filters" : 0,
"sleep for LSM checkpoint throttle" : 0,
"chunks in the LSM tree" : 0,
"highest merge generation in the LSM tree" : 0,
"queries that could have benefited from a Bloom filter that did not exist"
: 0,
"sleep for LSM merge throttle" : 0
},
"block-manager" : {
"file allocation unit size" : 4096,
"blocks allocated" : 581831,
"checkpoint size" : 1764864000,
"allocations requiring file extension" : 4304,
"blocks freed" : 489809,
"file magic number" : 120897,
"file major version number" : 1,
"minor version number" : 0,
"file bytes available for reuse" : 1568768,
"file size in bytes" : 1500979200
},
"btree" : {
"btree checkpoint generation" : 25478,
"column-store variable-size deleted values" : 0,
"column-store fixed-size leaf pages" : 0,
"column-store internal pages" : 0,
"column-store variable-size leaf pages" : 0,
"pages rewritten by compaction" : 0,
"number of key/value pairs" : 0,
"fixed-record size" : 0,
"maximum tree depth" : 5,
"maximum internal page key size" : 368,
"maximum internal page size" : 4096,
"maximum leaf page key size" : 3276,
"maximum leaf page size" : 32768,
"maximum leaf page value size" : 1048576,
"overflow pages" : 0,
"row-store internal pages" : 0,
"row-store leaf pages" : 0
},
"cache" : {
"bytes read into cache" : NumberLong("2843839082589"),
"bytes written from cache" : 65522296650,
"checkpoint blocked page eviction" : 4,
"unmodified pages evicted" : 11530699,
"page split during eviction deepened the tree" : 0,
"modified pages evicted" : 424472,
"data source pages selected for eviction unable to be evicted" : 77762,
"hazard pointer blocked page eviction" : 20840,
"internal pages evicted" : 50252,
"pages split during eviction" : 28366,
"in-memory page splits" : 0,
"overflow values cached in memory" : 0,
"pages read into cache" : 12062318,
"overflow pages read into cache" : 0,
"pages written from cache" : 540301
},
"compression" : {
"raw compression call failed, no additional data available" : 73292,
"raw compression call failed, additional data available" : 21316,
"raw compression call succeeded" : 495946,
"compressed pages read" : 12047447,
"compressed pages written" : 9749,
"page written failed to compress" : 0,
"page written was too small to compress" : 63543
},
"cursor" : {
"create calls" : 8236,
"insert calls" : 1337876,
"bulk-loaded cursor-insert calls" : 0,
"cursor-insert key and value bytes inserted" : 6266710027,
"next calls" : 102,
"prev calls" : 1,
"remove calls" : 0,
"cursor-remove key bytes removed" : 0,
"reset calls" : 304156030,
"search calls" : 308636727,
"search near calls" : 0,
"update calls" : 0,
"cursor-update value bytes updated" : 0
},
"reconciliation" : {
"dictionary matches" : 0,
"internal page multi-block writes" : 5637,
"leaf page multi-block writes" : 30643,
"maximum blocks required for a page" : 0,
"internal-page overflow keys" : 0,
"leaf-page overflow keys" : 0,
"overflow values written" : 0,
"pages deleted" : 0,
"page checksum matches" : 7621,
"page reconciliation calls" : 523383,
"page reconciliation calls for eviction" : 455853,
"leaf page key bytes discarded using prefix compression" : 0,
"internal page key bytes discarded using suffix compression" : 0
},
"session" : {
"object compaction" : 0,
"open cursor count" : 8236
},
"transaction" : {
"update conflicts" : 0
}
},
"nindexes" : 4,
"totalIndexSize" : 288075776,
"indexSizes" : {
"_id_" : 166514688,
"vd.ple.vd.acc_1_vd.ts_1_vd.pec_1" : 39645184,
"vd.ts_1_vd.pec_1" : 39198720,
"vd.ple.vd.acc_1_vd.ple.vd.pid_1_vd.pec_1_vd.ts_1" : 42717184
},
"ok" : 1
}
Asya Kamsky
2015-11-18 04:56:04 UTC
Since the command takes this long, there will be a line for it in the
mongod log - can you include that here please?

It might help if you run the count a couple of times to see whether the
performance is more or less the same.
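
Something like this quick shell loop would do (an untested sketch; adjust the names to your environment):

// Sketch: time a few consecutive runs of the same count from the shell.
var query = {
    "vd.ple.vd.acc" : "EY",
    "vd.ple.vd.pid" : "313c0296-5469-59f7-7cbe-5b818a2e657c",
    "vd.pec" : {"$gt" : 1}
};
for (var i = 1; i <= 3; i++) {
    var start = Date.now();
    var n = db.getCollection("events-qos-loadstart").count(query);
    print("run " + i + ": count=" + n + ", elapsed=" + (Date.now() - start) + " ms");
}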
--
Asya Kamsky
Lead Product Manager
MongoDB
Download MongoDB - mongodb.org/downloads
Free MongoDB Monitoring - cloud.mongodb.com
Free Online Education - university.mongodb.com
Get Involved - mongodb.org/community
We're Hiring! - https://www.mongodb.com/careers
Alex Paransky
2015-11-18 15:49:03 UTC
Asya,

I tried it a few times; here are the results from the log:

Nov 18 15:38:04 ec2-54-175-62-165 mongod.27000[9873]: [conn76138] command
cdc.$cmd command: count { count: "events-qos-loadstart", query: {
vd.ple.vd.acc: "EY", vd.ple.vd.pid: "313c0296-5469-59f7-7cbe-5b818a2e657c",
vd.pec: { $gt: 1.0 } }, fields: {} } planSummary: IXSCAN { vd.ple.vd.acc:
1, vd.ple.vd.pid: 1, vd.pec: 1, vd.ts: 1 } keyUpdates:0 writeConflicts:0
numYields:4539 reslen:44 locks:{ Global: { acquireCount: { r: 9080 },
acquireWaitCount: { r: 715 }, timeAcquiringMicros: { r: 16500145 } },
Database: { acquireCount: { r: 4540 } }, Collection: { acquireCount: { r:
4540 } } } 87137ms


Nov 18 15:43:26 ec2-54-175-62-165 mongod.27000[9873]: [conn76138] command
cdc.$cmd command: count { count: "events-qos-loadstart", query: {
vd.ple.vd.acc: "EY", vd.ple.vd.pid: "313c0296-5469-59f7-7cbe-5b818a2e657c",
vd.pec: { $gt: 1.0 } }, fields: {} } planSummary: IXSCAN { vd.ple.vd.acc:
1, vd.ple.vd.pid: 1, vd.pec: 1, vd.ts: 1 } keyUpdates:0 writeConflicts:0
numYields:4541 reslen:44 locks:{ Global: { acquireCount: { r: 9084 },
acquireWaitCount: { r: 730 }, timeAcquiringMicros: { r: 15110480 } },
Database: { acquireCount: { r: 4542 } }, Collection: { acquireCount: { r:
4542 } } } 87745ms


Nov 18 15:45:49 ec2-54-175-62-165 mongod.27000[9873]: [conn76138] command
cdc.$cmd command: count { count: "events-qos-loadstart", query: {
vd.ple.vd.acc: "EY", vd.ple.vd.pid: "313c0296-5469-59f7-7cbe-5b818a2e657c",
vd.pec: { $gt: 1.0 } }, fields: {} } planSummary: IXSCAN { vd.ple.vd.acc:
1, vd.ple.vd.pid: 1, vd.pec: 1, vd.ts: 1 } keyUpdates:0 writeConflicts:0
numYields:4539 reslen:44 locks:{ Global: { acquireCount: { r: 9080 },
acquireWaitCount: { r: 919 }, timeAcquiringMicros: { r: 16248981 } },
Database: { acquireCount: { r: 4540 } }, Collection: { acquireCount: { r:
4540 } } } 86816ms


Nov 18 15:47:34 ec2-54-175-62-165 mongod.27000[9873]: [conn76138] command
cdc.$cmd command: count { count: "events-qos-loadstart", query: {
vd.ple.vd.acc: "EY", vd.ple.vd.pid: "313c0296-5469-59f7-7cbe-5b818a2e657c",
vd.pec: { $gt: 1.0 } }, fields: {} } planSummary: IXSCAN { vd.ple.vd.acc:
1, vd.ple.vd.pid: 1, vd.pec: 1, vd.ts: 1 } keyUpdates:0 writeConflicts:0
numYields:4267 reslen:44 locks:{ Global: { acquireCount: { r: 8536 },
acquireWaitCount: { r: 622 }, timeAcquiringMicros: { r: 11486260 } },
Database: { acquireCount: { r: 4268 } }, Collection: { acquireCount: { r:
4268 } } } 74458ms


The count returned is always 390936. So this data is not changing.


A slightly different version of this query, with $eq instead of $gt, returns
much faster:


Nov 18 15:51:20 ec2-54-175-62-165 mongod.27000[9873]: [conn76138] command
cdc.$cmd command: count { count: "events-qos-loadstart", query: {
vd.ple.vd.acc: "EY", vd.ple.vd.pid: "313c0296-5469-59f7-7cbe-5b818a2e657c",
vd.pec: { $eq: 3.0 } }, fields: {} } planSummary: IXSCAN { vd.ple.vd.acc:
1, vd.ple.vd.pid: 1, vd.pec: 1, vd.ts: 1 } keyUpdates:0 writeConflicts:0
numYields:18 reslen:44 locks:{ Global: { acquireCount: { r: 38 },
acquireWaitCount: { r: 7 }, timeAcquiringMicros: { r: 66666 } }, Database:
{ acquireCount: { r: 19 } }, Collection: { acquireCount: { r: 19 } } } 329ms


The result of this count is: 1541


So it seems that the index scan itself is taking a long time?
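
As another experiment, I may run the same tally through the aggregation pipeline (a rough, untested sketch below) to see whether it behaves any differently from the count command on this predicate:

// Sketch: same predicate counted via $match + $group instead of count().
db.getCollection("events-qos-loadstart").aggregate([
    { "$match" : {
        "vd.ple.vd.acc" : "EY",
        "vd.ple.vd.pid" : "313c0296-5469-59f7-7cbe-5b818a2e657c",
        "vd.pec" : {"$gt" : 1}
    } },
    { "$group" : { "_id" : null, "n" : { "$sum" : 1 } } }
])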


Thanks for your help.


-AP_
Alex Paransky
2015-11-19 18:08:56 UTC
Do you think re-creating the index will make a difference?
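
If it seems worth trying, this is roughly what I would run on the primary (a sketch only; it rebuilds the same key pattern as the existing index, in the background so reads are not blocked):

// Sketch: drop and rebuild the compound index with the same key pattern.
var coll = db.getCollection("events-qos-loadstart");
coll.dropIndex("vd.ple.vd.acc_1_vd.ple.vd.pid_1_vd.pec_1_vd.ts_1");
coll.createIndex(
    { "vd.ple.vd.acc" : 1, "vd.ple.vd.pid" : 1, "vd.pec" : 1, "vd.ts" : 1 },
    { "background" : true }
);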
Alex Paransky
2015-11-21 07:41:31 UTC
I am reading that timeAcquiringMicros: { r: 16500145 } represents the time
spent acquiring locks. That seems quite high: about 16.5 seconds. There are no
writes being done against this database (other than replication). Can locking
be disabled? Other than replication, this is essentially a read-only database.

We are using WiredTiger on a 3.0.7 server. What can we try in order to improve
the performance of this read-only secondary replica?
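
For what it's worth, this is roughly what I plan to look at while the slow count is running (a sketch; field names as reported by a 3.0.x server):

// Sketch: inspect lock queues and WiredTiger read/write tickets during the count.
var s = db.serverStatus();
printjson(s.globalLock);                         // currentQueue / activeClients
printjson(s.wiredTiger.concurrentTransactions);  // available read/write tickets
printjson(db.currentOp({ "secs_running" : { "$gte" : 5 } }));  // long-running ops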

Thanks.
-AP_
Asya Kamsky
2015-11-21 14:59:11 UTC
Permalink
First, no, I don't think recreating an index is going to do anything.

Second, your comment about read-only seems strange - if there is any
replication, that means there are writes.

If there is no writing happening, then there is no replication happening
either, i.e. the secondaries are waiting for the primary to do some writes
so that they can "repeat" them.

Are you *sure* there are no writes happening? The reason this count is so
slow is that it's yielding. A lot. The question is why. There have been
some bugs fixed that caused a query to yield too much, but none that I
could find affect your exact version.

I just realized you said "read-only secondary replica" - there is no such
thing as a read-only secondary if it's replicating writes from the primary.
It has to repeat every single write that the primary does. All of them.
So I suspect there is a lot of writing actually going on (though it's not
clear why it would be yielding so often unless there is some flaw in the
algorithm that decides how often to yield).
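
(One way to gauge how much replicated write traffic this member is actually
applying - standard 3.0 shell helpers, included here just as a sketch:)

// ops applied via replication since startup (inserts/updates/deletes)
db.serverStatus().opcountersRepl

// how far behind the primary this member currently is
rs.printSlaveReplicationInfo()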

Btw, you can prove whether the writes are the issue or not by temporarily
stopping replication and then running this same query/count a few times.
Please only do this if you have several secondaries in this replica set; I
don't want you to risk your replica set's availability for this. In fact,
this collection is only about 15GB - you could dump it and restore it into
a standalone mongod that really _will_ be read-only and see if the
performance there is a lot faster (in particular, the number of yields is
key). If that's the case, then it would be important to determine whether
the system is simply unable to keep up with a heavy mixed workload or
whether something else is the issue.
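
(For the dump/restore route, something along these lines - the target port
and paths below are placeholders, not taken from this thread:)

mongodump --host <secondary-host> --port 27000 --db cdc \
    --collection events-qos-loadstart --out /tmp/qos-dump

# start a throw-away standalone mongod (no --replSet) on another port, then:
mongorestore --port 27001 --db cdc --collection events-qos-loadstart \
    /tmp/qos-dump/cdc/events-qos-loadstart.bson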

One thing you can do to see visually what's going on is this:

Repeat the full experiment as above, but first turn on a higher log level,
run this count a couple of times waiting maybe 30 seconds between them,
then set the log level back to "normal". This will give you a section of
the log with all operations logged. Feed that to mplotqueries
<https://www.google.com/search?q=mplotqueries> and see if that picture
tells you anything interesting (if not, you're welcome to post it here and
we can all take a look).
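
(A minimal way to do the log-level part from the mongo shell, assuming the
3.0 db.setLogLevel helper and direct access to this secondary:)

db.setLogLevel(1)   // raise verbosity so many more operations show up in the log
// ... run the slow count two or three times, ~30 seconds apart ...
db.setLogLevel(0)   // back to the default verbosity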

Whether or not there is a bug we don't know about in the version you're
running that causes too-frequent yields, there is something in your set-up
that's triggering those yields - we don't normally see indexed counts on
collections with <3M records take anywhere near this long.

You also say "other queries run quite fast, within 2 seconds" but 2 seconds
is *not* very fast at all! If you're running into a subtle bug, then the
best way to improve your performance would be to figure out what the bug is
so that we can fix it (for everyone, not just you :) ).

Asya
Alex Paransky
2015-11-23 20:42:16 UTC
Permalink
You are correct. There ARE writes happening, and there could be quite a
bit of them - A LOT. All of them come from replication, so there are no
"direct" client writes. I have about 21 collections in total and they are
all being replicated to; however, only THIS collection is experiencing
relatively "slow" performance. Yes, there is no such thing as a read-only
secondary replica from Mongo's point of view. So, let's make sure we are
on the same page: this is a SECONDARY server replicating data from the
PRIMARY, and it is only used to run read-only aggregations.
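
(Presumably the reads are pointed at this member either by connecting to it
directly, or via a secondary read preference - e.g. something like:)

// when connected directly to this secondary:
rs.slaveOk()

// or, when connecting through the replica set from the shell:
db.getMongo().setReadPref("secondary")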

I took the route of taking the server out of replication; before doing so,
I ran a few commands again:

BEFORE RESTARTING SERVER WITHOUT REPLICATION:

Nov 23 20:18:47 ec2-54-175-62-165 mongod.27000[9873]: [conn96865] command
cdc.$cmd command: count { count: "events-qos-loadstart", query: {
vd.ple.vd.acc: "EY", vd.ple.vd.pid: "313c0296-5469-59f7-7cbe-5b818a2e657c",
vd.pec: { $gt: 1.0 } }, fields: {} } planSummary: IXSCAN { vd.ple.vd.acc:
1, vd.ple.vd.pid: 1, vd.pec: 1, vd.ts: 1 } keyUpdates:0 writeConflicts:0
numYields:4146 reslen:44 locks:{ Global: { acquireCount: { r: 8294 },
acquireWaitCount: { r: 161 }, timeAcquiringMicros: { r: 437064 } },
Database: { acquireCount: { r: 4147 } }, Collection: { acquireCount: { r:
4147 } } } 64663ms


Nov 23 20:20:17 ec2-54-175-62-165 mongod.27000[9873]: [conn96865] command
cdc.$cmd command: count { count: "events-qos-loadstart", query: {
vd.ple.vd.acc: "EY", vd.ple.vd.pid: "313c0296-5469-59f7-7cbe-5b818a2e657c",
vd.pec: { $gt: 1.0 } }, fields: {} } planSummary: IXSCAN { vd.ple.vd.acc:
1, vd.ple.vd.pid: 1, vd.pec: 1, vd.ts: 1 } keyUpdates:0 writeConflicts:0
numYields:3708 reslen:44 locks:{ Global: { acquireCount: { r: 7418 },
acquireWaitCount: { r: 111 }, timeAcquiringMicros: { r: 302599 } },
Database: { acquireCount: { r: 3709 } }, Collection: { acquireCount: { r:
3709 } } } 47897ms


Nov 23 20:21:19 ec2-54-175-62-165 mongod.27000[9873]: [conn96865] command
cdc.$cmd command: count { count: "events-qos-loadstart", query: {
vd.ple.vd.acc: "EY", vd.ple.vd.pid: "313c0296-5469-59f7-7cbe-5b818a2e657c",
vd.pec: { $gt: 1.0 } }, fields: {} } planSummary: IXSCAN { vd.ple.vd.acc:
1, vd.ple.vd.pid: 1, vd.pec: 1, vd.ts: 1 } keyUpdates:0 writeConflicts:0
numYields:3487 reslen:44 locks:{ Global: { acquireCount: { r: 6976 },
acquireWaitCount: { r: 112 }, timeAcquiringMicros: { r: 261535 } },
Database: { acquireCount: { r: 3488 } }, Collection: { acquireCount: { r:
3488 } } } 37515ms

AFTER SHUTTING DOWN THE SERVER AND STARTING WITHOUT REPLICATION (on a
different port, commenting out the replication configuration):

Nov 23 20:30:57 ec2-54-175-62-165 mongod.27001[30855]: [conn3] command
cdc.$cmd command: count { count: "events-qos-loadstart", query: {
vd.ple.vd.acc: "EY", vd.ple.vd.pid: "313c0296-5469-59f7-7cbe-5b818a2e657c",
vd.pec: { $gt: 1.0 } }, fields: {} } planSummary: COUNT_SCAN {
vd.ple.vd.acc: 1, vd.ple.vd.pid: 1, vd.pec: 1, vd.ts: 1 } keyUpdates:0
writeConflicts:0 numYields:3058 reslen:44 locks:{ Global: { acquireCount: {
r: 6118 } }, Database: { acquireCount: { r: 3059 } }, Collection: {
acquireCount: { r: 3059 } } } 150ms


Nov 23 20:31:13 ec2-54-175-62-165 mongod.27001[30855]: [conn3] command
cdc.$cmd command: count { count: "events-qos-loadstart", query: {
vd.ple.vd.acc: "EY", vd.ple.vd.pid: "313c0296-5469-59f7-7cbe-5b818a2e657c",
vd.pec: { $gt: 1.0 } }, fields: {} } planSummary: COUNT_SCAN {
vd.ple.vd.acc: 1, vd.ple.vd.pid: 1, vd.pec: 1, vd.ts: 1 } keyUpdates:0
writeConflicts:0 numYields:3058 reslen:44 locks:{ Global: { acquireCount: {
r: 6118 } }, Database: { acquireCount: { r: 3059 } }, Collection: {
acquireCount: { r: 3059 } } } 104ms


Nov 23 20:31:17 ec2-54-175-62-165 mongod.27001[30855]: [conn3] command
cdc.$cmd command: count { count: "events-qos-loadstart", query: {
vd.ple.vd.acc: "EY", vd.ple.vd.pid: "313c0296-5469-59f7-7cbe-5b818a2e657c",
vd.pec: { $gt: 1.0 } }, fields: {} } planSummary: COUNT_SCAN {
vd.ple.vd.acc: 1, vd.ple.vd.pid: 1, vd.pec: 1, vd.ts: 1 } keyUpdates:0
writeConflicts:0 numYields:3058 reslen:44 locks:{ Global: { acquireCount: {
r: 6118 } }, Database: { acquireCount: { r: 3059 } }, Collection: {
acquireCount: { r: 3059 } } } 107ms


Nov 23 20:31:20 ec2-54-175-62-165 mongod.27001[30855]: [conn3] command
cdc.$cmd command: count { count: "events-qos-loadstart", query: {
vd.ple.vd.acc: "EY", vd.ple.vd.pid: "313c0296-5469-59f7-7cbe-5b818a2e657c",
vd.pec: { $gt: 1.0 } }, fields: {} } planSummary: COUNT_SCAN {
vd.ple.vd.acc: 1, vd.ple.vd.pid: 1, vd.pec: 1, vd.ts: 1 } keyUpdates:0
writeConflicts:0 numYields:3058 reslen:44 locks:{ Global: { acquireCount: {
r: 6118 } }, Database: { acquireCount: { r: 3059 } }, Collection: {
acquireCount: { r: 3059 } } } 104ms

AFTER STARTING THE SERVER BACK UP AS A SECONDARY:

Nov 23 20:43:10 ec2-54-175-62-165 mongod.27000[31167]: [conn47] command
cdc.$cmd command: count { count: "events-qos-loadstart", query: {
vd.ple.vd.acc: "EY", vd.ple.vd.pid: "313c0296-5469-59f7-7cbe-5b818a2e657c",
vd.pec: { $gt: 1.0 } }, fields: {} } planSummary: COUNT_SCAN {
vd.ple.vd.acc: 1, vd.ple.vd.pid: 1, vd.pec: 1, vd.ts: 1 } keyUpdates:0
writeConflicts:0 numYields:3058 reslen:44 locks:{ Global: { acquireCount: {
r: 6118 }, acquireWaitCount: { r: 9 }, timeAcquiringMicros: { r: 9508 } },
Database: { acquireCount: { r: 3059 } }, Collection: { acquireCount: { r:
3059 } } } 115ms


Nov 23 20:43:14 ec2-54-175-62-165 mongod.27000[31167]: [conn47] command
cdc.$cmd command: count { count: "events-qos-loadstart", query: {
vd.ple.vd.acc: "EY", vd.ple.vd.pid: "313c0296-5469-59f7-7cbe-5b818a2e657c",
vd.pec: { $gt: 1.0 } }, fields: {} } planSummary: COUNT_SCAN {
vd.ple.vd.acc: 1, vd.ple.vd.pid: 1, vd.pec: 1, vd.ts: 1 } keyUpdates:0
writeConflicts:0 numYields:3058 reslen:44 locks:{ Global: { acquireCount: {
r: 6118 } }, Database: { acquireCount: { r: 3059 } }, Collection: {
acquireCount: { r: 3059 } } } 104ms


Nov 23 20:43:17 ec2-54-175-62-165 mongod.27000[31167]: [conn47] command
cdc.$cmd command: count { count: "events-qos-loadstart", query: {
vd.ple.vd.acc: "EY", vd.ple.vd.pid: "313c0296-5469-59f7-7cbe-5b818a2e657c",
vd.pec: { $gt: 1.0 } }, fields: {} } planSummary: COUNT_SCAN {
vd.ple.vd.acc: 1, vd.ple.vd.pid: 1, vd.pec: 1, vd.ts: 1 } keyUpdates:0
writeConflicts:0 numYields:3058 reslen:44 locks:{ Global: { acquireCount: {
r: 6118 } }, Database: { acquireCount: { r: 3059 } }, Collection: {
acquireCount: { r: 3059 } } } 105ms


So, suddenly, things are running quite fast.


We don't have too much memory on this machine (it's a 15 gig box in EC2)...


             total       used       free     shared    buffers     cached
Mem:         15042       6911       8131          0         60       3663
-/+ buffers/cache:        3187      11855
Swap:            0          0          0
Total:       15042       6911       8131


We did notice that the memory on the box was fully utilized (without going
into swap) when things were running slow. After restarting the box, memory
was not fully utilized.


I will monitor the machine some more. Could this be a memory leak?
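
(Before concluding it's a leak, it might be worth checking whether this is
just the WiredTiger cache growing towards its configured limit - a quick
sketch using standard serverStatus fields:)

var c = db.serverStatus().wiredTiger.cache;
print("WT cache in use        : " + c["bytes currently in the cache"]);
print("WT cache configured max: " + c["maximum bytes configured"]);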


-AP_
Asya Kamsky
2015-11-27 20:33:04 UTC
Permalink
Alex:

Very interesting results.

While it _could_ be a memory leak, I wouldn't necessarily jump to that
conclusion. Of course, if the performance gets bad and restarting the
server magically fixes it, that's a tempting conclusion to embrace, but I
wonder if you noticed another very interesting difference in the logs
before and after:

Nov 23 20:21:19 ec2-54-175-62-165 mongod.27000[9873]: [conn96865] command
cdc.$cmd command: count { count: "events-qos-loadstart", query: {
vd.ple.vd.acc: "EY", vd.ple.vd.pid: "313c0296-5469-59f7-7cbe-5b818a2e657c",
vd.pec: { $gt: 1.0 } }, fields: {} } planSummary: IXSCAN { vd.ple.vd.acc:
1, vd.ple.vd.pid: 1, vd.pec: 1, vd.ts: 1 } keyUpdates:0 writeConflicts:0
numYields:*3487* reslen:44 locks:{ Global: { acquireCount: { r: 6976 },
acquireWaitCount: { r: 112 }, timeAcquiringMicros: { r: 261535 } },
Database: { acquireCount: { r: 3488 } }, Collection: { acquireCount: { r:
3488 } } } 37515ms

Nov 23 20:43:10 ec2-54-175-62-165 mongod.27000[31167]: [conn47] command
cdc.$cmd command: count { count: "events-qos-loadstart", query: {
vd.ple.vd.acc: "EY", vd.ple.vd.pid: "313c0296-5469-59f7-7cbe-5b818a2e657c",
vd.pec: { $gt: 1.0 } }, fields: {} } planSummary: COUNT_SCAN {
vd.ple.vd.acc: 1, vd.ple.vd.pid: 1, vd.pec: 1, vd.ts: 1 } keyUpdates:0
writeConflicts:0 numYields:*3058* reslen:44 locks:{ Global: { acquireCount:
{ r: 6118 }, acquireWaitCount: { r: 9 }, timeAcquiringMicros: { r: 9508 }
}, Database: { acquireCount: { r: 3059 } }, Collection: { acquireCount: {
r: 3059 } } } 115ms
Nov 23 20:43:17 ec2-54-175-62-165 mongod.27000[31167]: [conn47] command
cdc.$cmd command: count { count: "events-qos-loadstart", query: {
vd.ple.vd.acc: "EY", vd.ple.vd.pid: "313c0296-5469-59f7-7cbe-5b818a2e657c",
vd.pec: { $gt: 1.0 } }, fields: {} } planSummary: COUNT_SCAN {
vd.ple.vd.acc: 1, vd.ple.vd.pid: 1, vd.pec: 1, vd.ts: 1 } keyUpdates:0
writeConflicts:0 numYields:*3058* reslen:44 locks:{ Global: { acquireCount:
{ r: 6118 } }, Database: { acquireCount: { r: 3059 } }, Collection: {
acquireCount: { r: 3059 } } } 105ms

Two *very* different looking lines, because they show two different plan
summaries!

So what's going on? I'm going to go out on a limb and guess that maybe
what was running "before" was a different version than what is running
"after". So maybe there _was_ a bug fixed, but it's not about the number
of yields, as those are about the same...
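
(If it helps, the 3.0 shell can show which plan each server picks for this
count without actually executing it - a quick sketch:)

db.getCollection("events-qos-loadstart").explain("queryPlanner").count({
    "vd.ple.vd.acc" : "EY",
    "vd.ple.vd.pid" : "313c0296-5469-59f7-7cbe-5b818a2e657c",
    "vd.pec" : { "$gt" : 1 }
})
// look at queryPlanner.winningPlan (and its inputStage): COUNT_SCAN vs IXSCAN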

Would you check the version running now? The best way to check is in the
logs - especially in the old log, since that version may be harder to
track down!
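
(A couple of quick ways to check the version, for what it's worth:)

db.version()                       // e.g. "3.0.7"
db.serverBuildInfo().gitVersion

// the old mongod log should also contain a startup line along the lines of:
//   [initandlisten] db version v3.0.x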

Asya
they can "repeat" them.
Are you *sure* there are no writes happening? The reason this count is so
slow is because it's yielding. A lot. The question is why. There've been
some bugs fixed that cause a query to yield too much, but none that I could
find were affecting your exact version.
I just realized you said "read-only secondary replica" - there is no such
thing as read-only secondary if it's replicating writes from the primary.
It has to repeat every single write that the primary does. All of them.
So I suspect there is a lot of writing actually going on (though it's not
clear why it would be yielding so often unless there was some flaw in the
algorithm that decides how often to yield).
Btw, you can prove whether the writes are the issue or not by temporarily
stopping replication and then running this same query/count a few times.
Please only do this if you have several secondaries in this replica set; I
don't want you to risk your replica set availability for this. In fact,
this collection is only about 15GB - you could dump it and restore it into
a standalone mongod that really _will_ be read-only and see if the
performance on there is a lot faster (in particular the number of yields is
key). If that's the case then it would be important to determine whether
the system is simply unable to keep up with heavy mixed workload or if
something else is the issue.
Repeat the full experiment as above, but first turn on a higher log level,
run this count a couple of times waiting maybe 30 seconds between them,
then set the log level back to "normal". This will give you a section of
the log with all operations logged. Send that to mplotqueries
<https://www.google.com/search?q=mplotqueries> and see if that picture
tells you anything interesting (if not, you're welcome to post it here and
we can all take a look).
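As a rough sketch of that last step from the mongo shell (assuming 3.0+,
where db.setLogLevel() is available; the count is the one from this
thread):

// log all operations for the duration of the test
db.setLogLevel(1)

// run the count a couple of times, pausing ~30 seconds between runs
db.getCollection("events-qos-loadstart").count({
    "vd.ple.vd.acc" : "EY",
    "vd.ple.vd.pid" : "313c0296-5469-59f7-7cbe-5b818a2e657c",
    "vd.pec" : { "$gt" : 1 }
})

// back to the default verbosity
db.setLogLevel(0)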
Whether or not there is a bug we don't know about in the version you're
running that causes too-frequent yields, there is something in your
set-up that's triggering those yields - we don't normally see indexed
counts of collections with <3M records take anywhere near this long.
You also say "other queries run quite fast, within 2 seconds" but 2
seconds is *not* very fast at all! If you're running into a subtle bug,
then the best way to improve your performance would be to figure out what the
bug is so that we can fix it (for everyone, not just you :) ).
Asya
I am reading that timeAcquiringMicros: { r: 16500145 } } represents the
time it took to acquire the locks. That seems a bit high at 16.5 seconds.
There are no writes being done against this database (other than
replication). Can locking be disabled? Other than replication, this is
essentially a read-only database.
We are using WiredTiger on 3.0.7 server. What things can we try to do to
improve the performance of this read-only secondary replica?
Thanks.
-AP_
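As a hedged sketch, one way to see where that lock wait time is going
while a slow count is running (field names as they appear in 3.0
serverStatus/currentOp output):

// queue lengths on the global lock right now
db.serverStatus().globalLock.currentQueue

// cumulative acquire/wait counters per lock type
db.serverStatus().locks.Global

// operations currently waiting on a lock
db.currentOp(true).inprog.filter(function (op) { return op.waitingForLock; })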
Do you think re-creating the index will make a difference?
Asya,
Nov 18 15:38:04 ec2-54-175-62-165 *mongod*.27000[9873]: [conn76138]
command cdc.$cmd command: count { count: "events-qos-loadstart", query: {
vd.ple.vd.acc: "EY", vd.ple.vd.pid: "313c0296-5469-59f7-7cbe-5b818a2e657c",
1, vd.ple.vd.pid: 1, vd.pec: 1, vd.ts: 1 } keyUpdates:0 writeConflicts:0
numYields:4539 reslen:44 locks:{ Global: { acquireCount: { r: 9080 },
acquireWaitCount: { r: 715 }, timeAcquiringMicros: { r: 16500145 } },
4540 } } } 87137ms
Nov 18 15:43:26 ec2-54-175-62-165 mongod.27000[9873]: [conn76138] command
cdc.$cmd command: count { count: "events-qos-loadstart", query: {
vd.ple.vd.acc: "EY", vd.ple.vd.pid: "313c0296-5469-59f7-7cbe-5b818a2e657c",
1, vd.ple.vd.pid: 1, vd.pec: 1, vd.ts: 1 } keyUpdates:0 writeConflicts:0
numYields:4541 reslen:44 locks:{ Global: { acquireCount: { r: 9084 },
acquireWaitCount: { r: 730 }, timeAcquiringMicros: { r: 15110480 } },
4542 } } } 87745ms
Nov 18 15:45:49 ec2-54-175-62-165 mongod.27000[9873]: [conn76138] command
cdc.$cmd command: count { count: "events-qos-loadstart", query: {
vd.ple.vd.acc: "EY", vd.ple.vd.pid: "313c0296-5469-59f7-7cbe-5b818a2e657c",
1, vd.ple.vd.pid: 1, vd.pec: 1, vd.ts: 1 } keyUpdates:0 writeConflicts:0
numYields:4539 reslen:44 locks:{ Global: { acquireCount: { r: 9080 },
acquireWaitCount: { r: 919 }, timeAcquiringMicros: { r: 16248981 } },
4540 } } } 86816ms
Nov 18 15:47:34 ec2-54-175-62-165 mongod.27000[9873]: [conn76138] command
cdc.$cmd command: count { count: "events-qos-loadstart", query: {
vd.ple.vd.acc: "EY", vd.ple.vd.pid: "313c0296-5469-59f7-7cbe-5b818a2e657c",
1, vd.ple.vd.pid: 1, vd.pec: 1, vd.ts: 1 } keyUpdates:0 writeConflicts:0
numYields:4267 reslen:44 locks:{ Global: { acquireCount: { r: 8536 },
acquireWaitCount: { r: 622 }, timeAcquiringMicros: { r: 11486260 } },
4268 } } } 74458ms
The count returned is always 390936. So this data is not changing.
A slightly different version of this query with $eq instead of $gt returns much faster:
Nov 18 15:51:20 ec2-54-175-62-165 mongod.27000[9873]: [conn76138] command
cdc.$cmd command: count { count: "events-qos-loadstart", query: {
vd.ple.vd.acc: "EY", vd.ple.vd.pid: "313c0296-5469-59f7-7cbe-5b818a2e657c",
1, vd.ple.vd.pid: 1, vd.pec: 1, vd.ts: 1 } keyUpdates:0 writeConflicts:0
numYields:18 reslen:44 locks:{ Global: { acquireCount: { r: 38 },
{ acquireCount: { r: 19 } }, Collection: { acquireCount: { r: 19 } } } 329ms
The result of this count is: 1541
So it seems that doing an index scan is taking a long time?
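To make that comparison concrete, both shapes can be run through
explain("executionStats") and their keysExamined compared (a sketch only;
the $eq value of 1 below is an assumption, since the truncated log line
above doesn't show which value was actually used):

// range predicate - has to walk every matching index key
db.getCollection("events-qos-loadstart").explain("executionStats").count({
    "vd.ple.vd.acc" : "EY",
    "vd.ple.vd.pid" : "313c0296-5469-59f7-7cbe-5b818a2e657c",
    "vd.pec" : { "$gt" : 1 }
})

// equality predicate - far fewer keys to examine (value 1 is assumed)
db.getCollection("events-qos-loadstart").explain("executionStats").count({
    "vd.ple.vd.acc" : "EY",
    "vd.ple.vd.pid" : "313c0296-5469-59f7-7cbe-5b818a2e657c",
    "vd.pec" : { "$eq" : 1 }
})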
Thanks for your help.
-AP_
Since the command takes this long, there will be a line for it in the
mongod log - can you include that here please?
It might help if you run the count a couple of times and see if the
results are more or less the same, performance-wise.
{
"ns" : "cdc.events-qos-loadstart",
"count" : 2800752,
"size" : 16706527988,
"avgObjSize" : 5965,
"storageSize" : 1500979200,
"capped" : false,
"wiredTiger" : {
"metadata" : {
"formatVersion" : 1
},
"allocation_size=4KB,app_metadata=(formatVersion=1),block_allocation=best,block_compressor=zlib,cache_resident=0,checkpoint=(WiredTigerCheckpoint.64831=(addr=\"01e3018c3181e49cbf413ee3018c6a81e46d6e131fe3018c7b81e4ca799905808080e459751fc0e469318fc0\",order=64831,time=1447793646,size=1764864000,write_gen=1230720)),checkpoint_lsn=(10558,52273664),checksum=on,collator=,columns=,dictionary=0,format=btree,huffman_key=,huffman_value=,id=71,internal_item_max=0,internal_key_max=0,internal_key_truncate=,internal_page_max=4KB,key_format=q,key_gap=10,leaf_item_max=0,leaf_key_max=0,leaf_page_max=32KB,leaf_value_max=1MB,memory_page_max=10m,os_cache_dirty_max=0,os_cache_max=0,prefix_compression=0,prefix_compression_min=4,split_deepen_min_child=0,split_deepen_per_child=0,split_pct=90,value_format=u,version=(major=1,minor=1)",
"type" : "file",
"uri" : "statistics:table:collection-8--1012075229251210100",
"LSM" : {
"bloom filters in the LSM tree" : 0,
"bloom filter false positives" : 0,
"bloom filter hits" : 0,
"bloom filter misses" : 0,
"bloom filter pages evicted from cache" : 0,
"bloom filter pages read into cache" : 0,
"total size of bloom filters" : 0,
"sleep for LSM checkpoint throttle" : 0,
"chunks in the LSM tree" : 0,
"highest merge generation in the LSM tree" : 0,
"queries that could have benefited from a Bloom filter that did not exist"
: 0,
"sleep for LSM merge throttle" : 0
},
"block-manager" : {
"file allocation unit size" : 4096,
"blocks allocated" : 581831,
"checkpoint size" : 1764864000,
"allocations requiring file extension" : 4304,
"blocks freed" : 489809,
"file magic number" : 120897,
"file major version number" : 1,
"minor version number" : 0,
"file bytes available for reuse" : 1568768,
"file size in bytes" : 1500979200
},
"btree" : {
"btree checkpoint generation" : 25478,
"column-store variable-size deleted values" : 0,
"column-store fixed-size leaf pages" : 0,
"column-store internal pages" : 0,
"column-store variable-size leaf pages" : 0,
"pages rewritten by compaction" : 0,
"number of key/value pairs" : 0,
"fixed-record size" : 0,
"maximum tree depth" : 5,
"maximum internal page key size" : 368,
"maximum internal page size" : 4096,
"maximum leaf page key size" : 3276,
"maximum leaf page size" : 32768,
"maximum leaf page value size" : 1048576,
"overflow pages" : 0,
"row-store internal pages" : 0,
"row-store leaf pages" : 0
},
"cache" : {
"bytes read into cache" : NumberLong("2843839082589"),
"bytes written from cache" : 65522296650,
"checkpoint blocked page eviction" : 4,
"unmodified pages evicted" : 11530699,
"page split during eviction deepened the tree" : 0,
"modified pages evicted" : 424472,
"data source pages selected for eviction unable to be evicted" : 77762,
"hazard pointer blocked page eviction" : 20840,
"internal pages evicted" : 50252,
"pages split during eviction" : 28366,
"in-memory page splits" : 0,
"overflow values cached in memory" : 0,
"pages read into cache" : 12062318,
"overflow pages read into cache" : 0,
"pages written from cache" : 540301
},
"compression" : {
"raw compression call failed, no additional data available" : 73292,
"raw compression call failed, additional data available" : 21316,
"raw compression call succeeded" : 495946,
"compressed pages read" : 12047447,
"compressed pages written" : 9749,
"page written failed to compress" : 0,
"page written was too small to compress" : 63543
},
"cursor" : {
"create calls" : 8236,
"insert calls" : 1337876,
"bulk-loaded cursor-insert calls" : 0,
"cursor-insert key and value bytes inserted" : 6266710027,
"next calls" : 102,
"prev calls" : 1,
"remove calls" : 0,
"cursor-remove key bytes removed" : 0,
"reset calls" : 304156030,
"search calls" : 308636727,
"search near calls" : 0,
"update calls" : 0,
"cursor-update value bytes updated" : 0
},
"reconciliation" : {
"dictionary matches" : 0,
"internal page multi-block writes" : 5637,
"leaf page multi-block writes" : 30643,
"maximum blocks required for a page" : 0,
"internal-page overflow keys" : 0,
"leaf-page overflow keys" : 0,
"overflow values written" : 0,
"pages deleted" : 0,
"page checksum matches" : 7621,
"page reconciliation calls" : 523383,
"page reconciliation calls for eviction" : 455853,
"leaf page key bytes discarded using prefix compression" : 0,
"internal page key bytes discarded using suffix compression" : 0
},
"session" : {
"object compaction" : 0,
"open cursor count" : 8236
},
"transaction" : {
"update conflicts" : 0
}
},
"nindexes" : 4,
"totalIndexSize" : 288075776,
"indexSizes" : {
"_id_" : 166514688,
"vd.ple.vd.acc_1_vd.ts_1_vd.pec_1" : 39645184,
"vd.ts_1_vd.pec_1" : 39198720,
"vd.ple.vd.acc_1_vd.ple.vd.pid_1_vd.pec_1_vd.ts_1" : 42717184
},
"ok" : 1
}
The execution plan query came back very fast. The count query sits there
for at least 45 to 50 seconds. We are using MongoDB 3.0.7 with
WiredTiger.
{
"db" : "cdc",
"collections" : 32,
"objects" : 409884155,
"avgObjSize" : 3860.0832105378654,
"dataSize" : 1582186944981,
"storageSize" : 153111339008,
"numExtents" : 0,
"indexes" : 131,
"indexSize" : 50301149184,
"ok" : 1
}
Thanks.
-AP_
i'm a bit confused as the executionTimeMillis in the explain() output is
fast. Did the explain run quickly?
Btw are you using the WiredTiger or MMAP storage engine? (not sure it matters
here but good to know)
I am executing a query which is taking over 45 seconds to return (other
db.getCollection("events-qos-loadstart").count(
{
"vd.ple.vd.acc" : "EY",
"vd.ple.vd.pid" : "313c0296-5469-59f7-7cbe-5b818a2e657c",
"vd.pec" : {"$gt" : 1}
}
)
{
"v" : 1,
"key" : {
"vd.ple.vd.acc" : 1,
"vd.ple.vd.pid" : 1,
"vd.pec" : 1,
"vd.ts" : 1
},
"name" : "vd.ple.vd.acc_1_vd.ple.vd.pid_1_vd.pec_1_vd.ts_1",
"ns" : "cdc.events-qos-loadstart"
}
There are quite a bit of records in this collection (2,780,670), however,
the count above returns 384,848.
What can I check to determine why a simple count is taking over 45 seconds
to execute?
Here is the explain plain for the count (which shows that the index is
{
"queryPlanner" : {
"plannerVersion" : 1,
"namespace" : "cdc.events-qos-loadstart",
"indexFilterSet" : false,
"parsedQuery" : {
"$and" : [
{
"vd.ple.vd.acc" : {
"$eq" : "EY"
}
},
{
"vd.ple.vd.pid" : {
"$eq" : "313c0296-5469-59f7-7cbe-5b818a2e657c"
}
},
{
"vd.pec" : {
"$gt" : 1
}
}
]
},
"winningPlan" : {
"stage" : "COUNT",
"inputStage" : {
"stage" : "COUNT_SCAN",
"keyPattern" : {
"vd.ple.vd.acc" : 1,
"vd.ple.vd.pid" : 1,
...
Alex Paransky
2015-12-01 16:55:31 UTC
Permalink
Interesting observation. Our logs are being written to syslog, and our
servers typically run for a while, so I was not able to "definitively"
find out which version of the server was running when the IXSCAN appeared
in the explain plan. However, from MongoDB Cloud Manager I see that this
machine was updated to version 3.0.7 (from the previous version, 3.0.4) on
10/20/15 - 10:35:11.

So based on the timestamp of Nov 23, version 3.0.7 was already running
during both of these tests.

I am working on creating some more tests to see if the slowdown returns,
but at this point, it seems that the restart of the machine has "fixed" the
issue of slow queries.

-AP_
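Since memory is the current suspicion, a small sketch of what could be
sampled periodically from the shell while waiting for the slowdown to
return (field names as reported by serverStatus in 3.0):

// resident/virtual memory of the mongod process, in MB
db.serverStatus().mem

// how full the WiredTiger cache is
var wt = db.serverStatus().wiredTiger.cache;
print("WT cache bytes: " + wt["bytes currently in the cache"] +
      " / max configured: " + wt["maximum bytes configured"]);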
Post by Asya Kamsky
Very interesting results.
While it _could_ be a memory leak, I wouldn't necessarily jump to that
conclusion. Of course if the performance gets bad and restarting the
server magically fixes it that's a tempting conclusion to embrace, but I
wonder if you noticed another very interesting difference in the logs
Nov 23 20:21:19 ec2-54-175-62-165 mongod.27000[9873]: [conn96865] command
cdc.$cmd command: count { count: "events-qos-loadstart", query: {
vd.ple.vd.acc: "EY", vd.ple.vd.pid: "313c0296-5469-59f7-7cbe-5b818a2e657c",
1, vd.ple.vd.pid: 1, vd.pec: 1, vd.ts: 1 } keyUpdates:0 writeConflicts:0
numYields:*3487* reslen:44 locks:{ Global: { acquireCount: { r: 6976 },
acquireWaitCount: { r: 112 }, timeAcquiringMicros: { r: 261535 } },
3488 } } } 37515ms
Nov 23 20:43:10 ec2-54-175-62-165 mongod.27000[31167]: [conn47] command
cdc.$cmd command: count { count: "events-qos-loadstart", query: {
vd.ple.vd.acc: "EY", vd.ple.vd.pid: "313c0296-5469-59f7-7cbe-5b818a2e657c",
vd.pec: { $gt: 1.0 } }, fields: {} } planSummary: COUNT_SCAN {
vd.ple.vd.acc: 1, vd.ple.vd.pid: 1, vd.pec: 1, vd.ts: 1 } keyUpdates:0
writeConflicts:0 numYields:*3058* reslen:44 locks:{ Global: {
{ r: 9508 } }, Database: { acquireCount: { r: 3059 } }, Collection: {
acquireCount: { r: 3059 } } } 115ms
Nov 23 20:43:17 ec2-54-175-62-165 mongod.27000[31167]: [conn47] command
cdc.$cmd command: count { count: "events-qos-loadstart", query: {
vd.ple.vd.acc: "EY", vd.ple.vd.pid: "313c0296-5469-59f7-7cbe-5b818a2e657c",
vd.pec: { $gt: 1.0 } }, fields: {} } planSummary: COUNT_SCAN {
vd.ple.vd.acc: 1, vd.ple.vd.pid: 1, vd.pec: 1, vd.ts: 1 } keyUpdates:0
writeConflicts:0 numYields:*3058* reslen:44 locks:{ Global: {
acquireCount: { r: 6118 } }, Database: { acquireCount: { r: 3059 } },
Collection: { acquireCount: { r: 3059 } } } 105ms
Two *very* different looking lines, because they seem to be output a
different plan summary!
So what's going on? I'm going to go out on a limb and guess that maybe
what was running "before" was a different version than what is running
"after". So maybe there _was_ a bug fixed, but it's not about number of
yields as those are about the same...
Would you check the version running now - best way to check is in the logs
- especially in the old log since that version may be harder to track down!
Asya
Alex Paransky
2016-01-08 17:15:06 UTC
Permalink
Asya, I'm back to this issue. We have since updated to 3.0.8.

Jan 8 17:07:56 ### mongod.27000[25120]: [conn4087] command cdc.$cmd
command: count { count: "events-qos-buffering", query: { vd.ple.vd.pid:
"57de9139-cc7e-4b5c-51fd-aaa8517028f0", vd.ple.vd.acc: "EY" } }
planSummary: IXSCAN { vd.ple.vd.acc: 1, vd.ple.vd.pid: 1, vd.pec: 1, vd.ts:
1 } keyUpdates:0 writeConflicts:0 numYields:2158 reslen:44 locks:{ Global:
{ acquireCount: { r: 4318 }, acquireWaitCount: { r: 422 },
timeAcquiringMicros: { r: 3075915 } }, Database: { acquireCount: { r: 2159
} }, Collection: { acquireCount: { r: 2159 } } } 35693ms

The count returns only 197,855

However, timeAcquiringMicros seems a bit high. Still using WiredTiger.

Last time we did not return to this issue because, after the restart, things
were running quite happily and fast. Now, we are back to these issues.

What can I do to diagnose this further?

Thanks.
-AP_
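A few things that may be worth capturing the next time the count is slow,
as a hedged starting point (standard shell helpers, nothing specific to
this deployment):

// long-running operations and anything waiting on a lock
db.currentOp(true).inprog.filter(function (op) {
    return op.secs_running > 5 || op.waitingForLock;
})

// replication lag of this secondary relative to the primary
rs.printSlaveReplicationInfo()

// lock queue lengths at the moment of the slowdown
db.serverStatus().globalLock.currentQueue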
Post by Alex Paransky
Interesting observation. Our logs are being written to syslog, and our
servers typically run for a while. So, I was not able to "definitively"
find out which version of the server was running when the IXSCAN appeared
in the explain plan, however, from MongoDB Cloud Manager I see that this
machine was updated to version 3.0.7 (from 3.0.4 previous version) on
10/20/15 - 10:35:11.
So based on the timestamp of Nov 23, version 3.0.7 was already running
during both of these tests.
I am working on creating some more tests to see if the slow down returns,
but at this point, it seems that the restart of the machine has "fixed" the
issue of slow queries.
-AP_
Very interesting results.
While it _could_ be a memory leak, I wouldn't necessarily jump to that
conclusion. Of course if the performance gets bad and restarting the
server magically fixes it that's a tempting conclusion to embrace, but I
wonder if you noticed another very interesting difference in the logs
Nov 23 20:21:19 ec2-54-175-62-165 mongod.27000[9873]: [conn96865] command
cdc.$cmd command: count { count: "events-qos-loadstart", query: {
vd.ple.vd.acc: "EY", vd.ple.vd.pid: "313c0296-5469-59f7-7cbe-5b818a2e657c",
1, vd.ple.vd.pid: 1, vd.pec: 1, vd.ts: 1 } keyUpdates:0 writeConflicts:0
numYields:*3487* reslen:44 locks:{ Global: { acquireCount: { r: 6976 },
acquireWaitCount: { r: 112 }, timeAcquiringMicros: { r: 261535 } },
3488 } } } 37515ms
Nov 23 20:43:10 ec2-54-175-62-165 mongod.27000[31167]: [conn47] command
cdc.$cmd command: count { count: "events-qos-loadstart", query: {
vd.ple.vd.acc: "EY", vd.ple.vd.pid: "313c0296-5469-59f7-7cbe-5b818a2e657c",
vd.pec: { $gt: 1.0 } }, fields: {} } planSummary: COUNT_SCAN {
vd.ple.vd.acc: 1, vd.ple.vd.pid: 1, vd.pec: 1, vd.ts: 1 } keyUpdates:0
writeConflicts:0 numYields:*3058* reslen:44 locks:{ Global: {
{ r: 9508 } }, Database: { acquireCount: { r: 3059 } }, Collection: {
acquireCount: { r: 3059 } } } 115ms
Nov 23 20:43:17 ec2-54-175-62-165 mongod.27000[31167]: [conn47] command
cdc.$cmd command: count { count: "events-qos-loadstart", query: {
vd.ple.vd.acc: "EY", vd.ple.vd.pid: "313c0296-5469-59f7-7cbe-5b818a2e657c",
vd.pec: { $gt: 1.0 } }, fields: {} } planSummary: COUNT_SCAN {
vd.ple.vd.acc: 1, vd.ple.vd.pid: 1, vd.pec: 1, vd.ts: 1 } keyUpdates:0
writeConflicts:0 numYields:*3058* reslen:44 locks:{ Global: {
acquireCount: { r: 6118 } }, Database: { acquireCount: { r: 3059 } },
Collection: { acquireCount: { r: 3059 } } } 105ms
Two *very* different looking lines, because they seem to be output a
different plan summary!
So what's going on? I'm going to go out on a limb and guess that maybe
what was running "before" was a different version than what is running
"after". So maybe there _was_ a bug fixed, but it's not about number of
yields as those are about the same...
Would you check the version running now - best way to check is in the logs
- especially in the old log since that version may be harder to track down!
Asya
You are correct. There ARE writes which are happening, and there could be
quite a bit of them. A LOT. All of them do come from replication so there
are no "direct" client writes. I have about 21 collections in total and
they are all being replicated to, however, only THIS collection is
experiencing relatively "slow" performance times. Yes, there is no such
things as read-only secondary replica from Mongo's point of view. So,
let's make sure we are on the same page. This is a SECONDARY server
replicating data from the PRIMARY and is only used to run read-only
aggregations.
I took the route of taking the server out of the replication, before doing
Nov 23 20:18:47 ec2-54-175-62-165 mongod.27000[9873]: [conn96865] command
cdc.$cmd command: count { count: "events-qos-loadstart", query: {
vd.ple.vd.acc: "EY", vd.ple.vd.pid: "313c0296-5469-59f7-7cbe-5b818a2e657c",
1, vd.ple.vd.pid: 1, vd.pec: 1, vd.ts: 1 } keyUpdates:0 writeConflicts:0
numYields:4146 reslen:44 locks:{ Global: { acquireCount: { r: 8294 },
acquireWaitCount: { r: 161 }, timeAcquiringMicros: { r: 437064 } },
4147 } } } 64663ms
Nov 23 20:20:17 ec2-54-175-62-165 mongod.27000[9873]: [conn96865] command
cdc.$cmd command: count { count: "events-qos-loadstart", query: {
vd.ple.vd.acc: "EY", vd.ple.vd.pid: "313c0296-5469-59f7-7cbe-5b818a2e657c",
1, vd.ple.vd.pid: 1, vd.pec: 1, vd.ts: 1 } keyUpdates:0 writeConflicts:0
numYields:3708 reslen:44 locks:{ Global: { acquireCount: { r: 7418 },
acquireWaitCount: { r: 111 }, timeAcquiringMicros: { r: 302599 } },
3709 } } } 47897ms
Nov 23 20:21:19 ec2-54-175-62-165 mongod.27000[9873]: [conn96865] command
cdc.$cmd command: count { count: "events-qos-loadstart", query: {
vd.ple.vd.acc: "EY", vd.ple.vd.pid: "313c0296-5469-59f7-7cbe-5b818a2e657c",
1, vd.ple.vd.pid: 1, vd.pec: 1, vd.ts: 1 } keyUpdates:0 writeConflicts:0
numYields:3487 reslen:44 locks:{ Global: { acquireCount: { r: 6976 },
acquireWaitCount: { r: 112 }, timeAcquiringMicros: { r: 261535 } },
3488 } } } 37515ms
AFTER SHUTTING DOWN THE SERVER AND STARTING WITHOUT REPLICATION (on a
Nov 23 20:30:57 ec2-54-175-62-165 mongod.27001[30855]: [conn3] command
cdc.$cmd command: count { count: "events-qos-loadstart", query: {
vd.ple.vd.acc: "EY", vd.ple.vd.pid: "313c0296-5469-59f7-7cbe-5b818a2e657c",
vd.pec: { $gt: 1.0 } }, fields: {} } planSummary: COUNT_SCAN {
vd.ple.vd.acc: 1, vd.ple.vd.pid: 1, vd.pec: 1, vd.ts: 1 } keyUpdates:0
writeConflicts:0 numYields:3058 reslen:44 locks:{ Global: { acquireCount: {
r: 6118 } }, Database: { acquireCount: { r: 3059 } }, Collection: {
acquireCount: { r: 3059 } } } 150ms
Nov 23 20:31:13 ec2-54-175-62-165 mongod.27001[30855]: [conn3] command
cdc.$cmd command: count { count: "events-qos-loadstart", query: {
vd.ple.vd.acc: "EY", vd.ple.vd.pid: "313c0296-5469-59f7-7cbe-5b818a2e657c",
vd.pec: { $gt: 1.0 } }, fields: {} } planSummary: COUNT_SCAN {
vd.ple.vd.acc: 1, vd.ple.vd.pid: 1, vd.pec: 1, vd.ts: 1 } keyUpdates:0
writeConflicts:0 numYields:3058 reslen:44 locks:{ Global: { acquireCount: {
r: 6118 } }, Database: { acquireCount: { r: 3059 } }, Collection: {
acquireCount: { r: 3059 } } } 104ms
Nov 23 20:31:17 ec2-54-175-62-165 mongod.27001[30855]: [conn3] command
cdc.$cmd command: count { count: "events-qos-loadstart", query: {
vd.ple.vd.acc: "EY", vd.ple.vd.pid: "313c0296-5469-59f7-7cbe-5b818a2e657c",
vd.pec: { $gt: 1.0 } }, fields: {} } planSummary: COUNT_SCAN {
vd.ple.vd.acc: 1, vd.ple.vd.pid: 1, vd.pec: 1, vd.ts: 1 } keyUpdates:0
writeConflicts:0 numYields:3058 reslen:44 locks:{ Global: { acquireCount: {
r: 6118 } }, Database: { acquireCount: { r: 3059 } }, Collection: {
acquireCount: { r: 3059 } } } 107ms
Nov 23 20:31:20 ec2-54-175-62-165 mongod.27001[30855]: [conn3] command
cdc.$cmd command: count { count: "events-qos-loadstart", query: {
vd.ple.vd.acc: "EY", vd.ple.vd.pid: "313c0296-5469-59f7-7cbe-5b818a2e657c",
vd.pec: { $gt: 1.0 } }, fields: {} } planSummary: COUNT_SCAN {
vd.ple.vd.acc: 1, vd.ple.vd.pid: 1, vd.pec: 1, vd.ts: 1 } keyUpdates:0
writeConflicts:0 numYields:3058 reslen:44 locks:{ Global: { acquireCount: {
r: 6118 } }, Database: { acquireCount: { r: 3059 } }, Collection: {
acquireCount: { r: 3059 } } } 104ms
Nov 23 20:43:10 ec2-54-175-62-165 mongod.27000[31167]: [conn47] command
cdc.$cmd command: count { count: "events-qos-loadstart", query: {
vd.ple.vd.acc: "EY", vd.ple.vd.pid: "313c0296-5469-59f7-7cbe-5b818a2e657c",
vd.pec: { $gt: 1.0 } }, fields: {} } planSummary: COUNT_SCAN {
vd.ple.vd.acc: 1, vd.ple.vd.pid: 1, vd.pec: 1, vd.ts: 1 } keyUpdates:0
writeConflicts:0 numYields:3058 reslen:44 locks:{ Global: { acquireCount: {
r: 6118 }, acquireWaitCount: { r: 9 }, timeAcquiringMicros: { r: 9508 } },
3059 } } } 115ms
Nov 23 20:43:14 ec2-54-175-62-165 mongod.27000[31167]: [conn47] command
cdc.$cmd command: count { count: "events-qos-loadstart", query: {
vd.ple.vd.acc: "EY", vd.ple.vd.pid: "313c0296-5469-59f7-7cbe-5b818a2e657c",
vd.pec: { $gt: 1.0 } }, fields: {} } planSummary: COUNT_SCAN {
vd.ple.vd.acc: 1, vd.ple.vd.pid: 1, vd.pec: 1, vd.ts: 1 } keyUpdates:0
writeConflicts:0 numYields:3058 reslen:44 locks:{ Global: { acquireCount: {
r: 6118 } }, Database: { acquireCount: { r: 3059 } }, Collection: {
acquireCount: { r: 3059 } } } 104ms
Nov 23 20:43:17 ec2-54-175-62-165 mongod.27000[31167]: [conn47] command
cdc.$cmd command: count { count: "events-qos-loadstart", query: {
vd.ple.vd.acc: "EY", vd.ple.vd.pid: "313c0296-5469-59f7-7cbe-5b818a2e657c",
vd.pec: { $gt: 1.0 } }, fields: {} } planSummary: COUNT_SCAN {
vd.ple.vd.acc: 1, vd.ple.vd.pid: 1, vd.pec: 1, vd.ts: 1 } keyUpdates:0
writeConflicts:0 numYields:3058 reslen:44 locks:{ Global: { acquireCount: {
r: 6118 } }, Database: { acquireCount: { r: 3059 } }, Collection: {
acquireCount: { r: 3059 } } } 105ms
So, suddenly, things are running quite fast.
We don't have too much memory on this machine (it's a 15 gig box in EC2)...
total used free shared buffers cached
Mem: 15042 6911 8131 0 60 3663
-/+ buffers/cache: 3187 11855
Swap: 0 0 0
Total: 15042 6911 8131
We did notice that the memory on the box was fully utilized (without going
into swap) when things were running slow. After restarting the box, memory
was not fully utilized.
I will monitor the machine some more. Could this be a memory leak?
-AP_
First, no, I don't think recreating an index is going to do anything.
Second, your comment about read-only seems strange - if there is any
replication that means there are writes.
If there is no writing happening, then there is no replication happening
either, i.e. secondaries are waiting for primary to do some writes so that
they can "repeat" them.
Are you *sure* there are no writes happening? The reason this count is so
slow is because it's yielding. A lot. The question is why. There've been
some bugs fixed that cause a query to yield too much, but none that I could
find were affecting your exact version.
I just realized you said "read-only secondary replica" - there is no such
thing as read-only secondary if it's replicating writes from the primary.
It has to repeat every single write that the primary does. All of them.
So I suspect there is a lot of writing actually going on (though it's not
clear why it would be yielding so often unless there was some flaw in the
algorithm that decides how often to yield.
Btw, you can prove whether the writes are the issue or not by temporarily
stopping replication and then running this same query/count a few times.
Please only do this if you have several secondaries in this replica set, I
don't want you to risk your replica set availability for this. In fact,
this collection is only about 15GB - you could dump it and restore it into
a standalone mongod that really _will_ be read-only and see if the
performance on there is a lot faster (in particular the number of yields is
key). If that's the case then it would be important to determine whether
the system is simply unable to keep up with heavy mixed workload or if
something else is the issue.
Repeat the full experiment as above but first turning on higher log level,
running this count a couple of times waiting maybe 30 seconds between them,
then setting log level back to "normal". This will give you a second of
the log with all operations logged. Send that to mplotqueries
<https://www.google.com/search?q=mplotqueries> and see if that picture
tells you anything interesting (if not, you're welcome to post it here and
we can all take a look).
Whether or not there is a bug in the version you're running we don't know
about causing "too-frequent-yields" or not, there is something in your
set-up that's triggering those yields - we don't normally see indexed
counts of collections with <3M records take anywhere near this long.
You also say "other queries run quite fast, within 2 seconds" but 2
seconds is *not* very fast at all! If you're running into a subtle bug,
then best way to improve your performance would be to figure out what the
bug is so that we can fix it (for everyone, not just you :) ).
Asya
I am reading that timeAcquiringMicros: { r: 16500145 } } represents the
time it took to getting the locks. That seems a bit high of 16.5 seconds.
There are no writes being done against this database (other than
replication). Can locking be disabled? Other than replication, this is
essentially a read-only database.
We are using WiredTiger on 3.0.7 server. What things can we try to do to
improve the performance of this read-only secondary replica?
Thanks.
-AP_
Do you think re-creating index will make a difference?
Asya,
Nov 18 15:38:04 ec2-54-175-62-165 *mongod*.27000[9873]: [conn76138]
command cdc.$cmd command: count { count: "events-qos-loadstart", query: {
vd.ple.vd.acc: "EY", vd.ple.vd.pid: "313c0296-5469-59f7-7cbe-5b818a2e657c",
1, vd.ple.vd.pid: 1, vd.pec: 1, vd.ts: 1 } keyUpdates:0 writeConflicts:0
numYields:4539 reslen:44 locks:{ Global: { acquireCount: { r: 9080 },
acquireWaitCount: { r: 715 }, timeAcquiringMicros: { r: 16500145 } },
4540 } } } 87137ms
Nov 18 15:43:26 ec2-54-175-62-165 mongod.27000[9873]: [conn76138] command
cdc.$cmd command: count { count: "events-qos-loadstart", query: {
vd.ple.vd.acc: "EY", vd.ple.vd.pid: "313c0296-5469-59f7-7cbe-5b818a2e657c",
1, vd.ple.vd.pid: 1, vd.pec: 1, vd.ts: 1 } keyUpdates:0 writeConflicts:0
numYields:4541 reslen:44 locks:{ Global: { acquireCount: { r: 9084 },
acquireWaitCount: { r: 730 }, timeAcquiringMicros: { r: 15110480 } },
4542 } } } 87745ms
Nov 18 15:45:49 ec2-54-175-62-165 mongod.27000[9873]: [conn76138] command
cdc.$cmd command: count { count: "events-qos-loadstart", query: {
vd.ple.vd.acc: "EY", vd.ple.vd.pid: "313c0296-5469-59f7-7cbe-5b818a2e657c",
1, vd.ple.vd.pid: 1, vd.pec: 1, vd.ts: 1 } keyUpdates:0 writeConflicts:0
numYields:4539 reslen:44 locks:{ Global: { acquireCount: { r: 9080 },
acquireWaitCount: { r: 919 }, timeAcquiringMicros: { r: 16248981 } },
4540 } } } 86816ms
Nov 18 15:47:34 ec2-54-175-62-165 mongod.27000[9873]: [conn76138] command
cdc.$cmd command: count { count: "events-qos-loadstart", query: {
vd.ple.vd.acc: "EY", vd.ple.vd.pid: "313c0296-5469-59f7-7cbe-5b818a2e657c",
1, vd.ple.vd.pid: 1, vd.pec: 1, vd.ts: 1 } keyUpdates:0 writeConflicts:0
numYields:4267 reslen:44 locks:{ Global: { acquireCount: { r: 8536 },
acquireWaitCount: { r: 622 }, timeAcquiringMicros: { r: 11486260 } },
4268 } } } 74458ms
The count returned is always 390936. So this data is not changing.
Slightly different version of this query with $eq instead of $gt returns
Nov 18 15:51:20 ec2-54-175-62-165 mongod.27000[9873]: [conn76138] command
cdc.$cmd command: count { count: "events-qos-loadstart", query: {
vd.ple.vd.acc: "EY", vd.ple.vd.pid: "313c0296-5469-59f7-7cbe-5b818a2e657c",
1, vd.ple.vd.pid: 1, vd.pec: 1, vd.ts: 1 } keyUpdates:0 writeConflicts:0
numYields:18 reslen:44 locks:{ Global: { acquireCount: { r: 38 },
{ acquireCount: { r: 19 } }, Collection: { acquireCount: { r: 19 } } } 329ms
The result of this count is: 1541
So it seems that doing an index scan is taking a long time?
Thanks for your help.
-AP_
Since the command takes this long, there will be a line for it in the
mongod log - can you include that here please?
It might help if you run the count a couple of times and see if the
results are more or less the same performance wise.
{
"ns" : "cdc.events-qos-loadstart",
"count" : 2800752,
"size" : 16706527988,
"avgObjSize" : 5965,
"storageSize" : 1500979200,
"capped" : false,
"wiredTiger" : {
"metadata" : {
"formatVersion" : 1
},
"allocation_size=4KB,app_metadata=(formatVersion=1),block_allocation=best,block_compressor=zlib,cache_resident=0,checkpoint=(WiredTigerCheckpoint.64831=(addr=\"01e3018c3181e49cbf413ee3018c6a81e46d6e131fe3018c7b81e4ca799905808080e459751fc0e469318fc0\",order=64831,time=1447793646,size=1764864000,write_gen=1230720)),checkpoint_lsn=(10558,52273664),checksum=on,collator=,columns=,dictionary=0,format=btree,huffman_key=,huffman_value=,id=71,internal_item_max=0,internal_key_max=0,internal_key_truncate=,internal_page_max=4KB,key_format=q,key_gap=10,leaf_item_max=0,leaf_key_max=0,leaf_page_max=32KB,leaf_value_max=1MB,memory_page_max=10m,os_cache_dirty_max=0,os_cache_max=0,prefix_compression=0,prefix_compression_min=4,split_deepen_min_child=0,split_deepen_per_child=0,split_pct=90,value_format=u,version=(major=1,minor=1)",
"type" : "file",
"uri" : "statistics:table:collection-8--1012075229251210100",
"LSM" : {
"bloom filters in the LSM tree" : 0,
"bloom filter false positives" : 0,
"bloom filter hits" : 0,
"bloom filter misses" : 0,
"bloom filter pages evicted from cache" : 0,
"bloom filter pages read into cache" : 0,
"total size of bloom filters" : 0,
"sleep for LSM checkpoint throttle" : 0,
"chunks in the LSM tree" : 0,
"highest merge generation in the LSM tree" : 0,
"queries that could have benefited from a Bloom filter that did not exist"
: 0,
"sleep for LSM merge throttle" : 0
},
"block-manager" : {
"file allocation unit size" : 4096,
"blocks allocated" : 581831,
"checkpoint size" : 1764864000,
"allocations requiring file extension" : 4304,
"blocks freed" : 489809,
"file magic number" : 120897,
"file major version number" : 1,
"minor version number" : 0,
"file bytes available for reuse" : 1568768,
"file size in bytes" : 1500979200
},
"btree" : {
"btree checkpoint generation" : 25478,
"column-store variable-size deleted values" : 0,
"column-store fixed-size leaf pages" : 0,
"column-store internal pages" : 0,
"column-store variable-size leaf pages" : 0,
"pages rewritten by compaction" : 0,
"number of key/value pairs" : 0,
"fixed-record size" : 0,
"maximum tree depth" : 5,
"maximum internal page key size" : 368,
"maximum internal page size" : 4096,
"maximum leaf page key size" : 3276,
"maximum leaf page size" : 32768,
"maximum leaf page value size" : 1048576,
"overflow pages" : 0,
"row-store internal pages" : 0,
"row-store leaf pages" : 0
},
"cache" : {
"bytes read into cache" : NumberLong("2843839082589"),
"bytes written from cache" : 65522296650,
"checkpoint blocked page eviction" : 4,
"unmodified pages evicted" : 11530699,
"page split during eviction deepened the tree" : 0,
"modified pages evicted" : 424472,
"data source pages selected for eviction unable to be evicted" : 77762,
"hazard pointer blocked page eviction" : 20840,
"internal pages evicted" : 50252,
"pages split during eviction" : 28366,
"in-memory page splits" : 0,
"overflow values cached in memory" : 0,
"pages read into cache" : 12062318,
"overflow pages read into cache" : 0,
"pages written from cache" : 540301
},
"compression" : {
"raw compression call failed, no additional data available" : 73292,
"raw compression call failed, additional data available" : 21316,
"raw compression call succeeded" : 495946,
"compressed pages read" : 12047447,
"compressed pages written" : 9749,
"page written failed to compress" : 0,
"page written was too small to compress" : 63543
},
"cursor" : {
"create calls" : 8236,
"insert calls" : 1337876,
"bulk-loaded cursor-insert calls" : 0,
"cursor-insert key and value bytes inserted" : 6266710027,
"next calls" : 102,
"prev calls" : 1,
"remove calls" : 0,
"cursor-remove key bytes removed" : 0,
"reset calls" : 304156030,
"search calls" : 308636727,
"search near calls" : 0,
"update calls" : 0,
"cursor-update value bytes updated" : 0
},
"reconciliation" : {
"dictionary matches" : 0,
"internal page multi-block writes" : 5637,
"leaf page multi-block writes" : 30643,
"maximum blocks required for a page" : 0,
"internal-page overflow keys" : 0,
"leaf-page overflow keys" : 0,
"overflow values written" : 0,
"pages deleted" : 0,
"page checksum matches" : 7621,
"page reconciliation calls" : 523383,
"page reconciliation calls for eviction" : 455853,
"leaf page key bytes discarded using prefix compression" : 0,
"internal page key bytes discarded using suffix compression" : 0
},
"session" : {
"object compaction" : 0,
"open cursor count" : 8236
},
"transaction" : {
"update conflicts" : 0
}
},
"nindexes" : 4,
"totalIndexSize" : 288075776,
"indexSizes" : {
"_id_" : 166514688,
"vd.ple.vd.acc_1_vd.ts_1_vd.pec_1" : 39645184,
"vd.ts_1_vd.pec_1" : 39198720,
"vd.ple.vd.acc_1_vd.ple.vd.pid_1_vd.pec_1_vd.ts_1" : 42717184
},
"ok" : 1
}
The execution plan query came back very fast. The count query sits there
for at least 45 to 50 seconds. We are using MongoDB 3.0.7 with WiredTiger.
{
"db" : "cdc",
"collections" : 32,
"objects" : 409884155,
"avgObjSize" : 3860.0832105378654,
"dataSize" : 1582186944981,
"storageSize" : 153111339008,
"numExtents" : 0,
"indexes" : 131,
"indexSize" : 50301149184,
"ok" : 1
}
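For reference, one way to put a wall-clock number next to the ms figure mongod writes to the log is to time the count directly in the shell - a minimal sketch only, using the same predicate as above:

// minimal sketch: time the count in the shell so the wall-clock duration
// can be compared with the ms value mongod logs for the same command
var t0 = new Date();
var n = db.getCollection("events-qos-loadstart").count({
    "vd.ple.vd.acc" : "EY",
    "vd.ple.vd.pid" : "313c0296-5469-59f7-7cbe-5b818a2e657c",
    "vd.pec" : { "$gt" : 1 }
});
print("count=" + n + " took " + (new Date() - t0) + "ms");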
Thanks.
-AP_
i'm a bit confused as the executionTimeMillis in the explain() output is
fast. Did the explain run quickly?
Btw are you using WiredTiger or the mmap storage engine? (not sure it matters here but good to know)
I am executing a query which is taking over 45 seconds to return (other
db.getCollection("events-qos-loadstart").count(
{
"vd.ple.vd.acc" : "EY",
"vd.ple.vd.pid" : "313c0296-5469-59f7-7cbe-5b81
...
Alex Paransky
2016-01-08 17:17:11 UTC
Permalink
It seems that we are once again using IXSCAN instead of COUNT_SCAN, why?
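One way to dig into this - only a sketch, assuming the explain() and plan-cache helpers that ship with the 3.0 shell, and whether a cached plan is involved at all is just a guess - is to ask for executionStats on the same count and to clear any cached plan for that query shape before re-running it:

var coll = db.getCollection("events-qos-loadstart");

// what plan would the planner pick right now for this exact count?
coll.explain("executionStats").count({
    "vd.ple.vd.pid" : "57de9139-cc7e-4b5c-51fd-aaa8517028f0",
    "vd.ple.vd.acc" : "EY"
});

// drop any cached plan for this query shape so the next real count re-plans from scratch
coll.getPlanCache().clear();

If the log still shows IXSCAN after the cache is cleared while explain() keeps choosing COUNT_SCAN, a stale cached plan can probably be ruled out.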

-AP_
Post by Alex Paransky
Asya, I am back to this issue. We have since updated to 3.0.8.
Jan 8 17:07:56 ### mongod.27000[25120]: [conn4087] command cdc.$cmd
"57de9139-cc7e-4b5c-51fd-aaa8517028f0", vd.ple.vd.acc: "EY" } }
{ acquireCount: { r: 4318 }, acquireWaitCount: { r: 422 },
timeAcquiringMicros: { r: 3075915 } }, Database: { acquireCount: { r: 2159
} }, Collection: { acquireCount: { r: 2159 } } } 35693ms
The count returns only 197,855
However, timeAcquiringMicros seems a bit high. Still using WiredTiger.
Last time we did not return to this issue as after the restart things were
running quite happily and fast. Now, we are back to these issues.
What can I do to diagnose some more?
Thanks.
-AP_
Post by Alex Paransky
Interesting observation. Our logs are being written to syslog, and our
servers typically run for a while. So, I was not able to "definitively"
find out which version of the server was running when the IXSCAN appeared
in the explain plan, however, from MongoDB Cloud Manager I see that this
machine was updated to version 3.0.7 (from 3.0.4 previous version) on
10/20/15 - 10:35:11.
So based on the timestamp of Nov 23, version 3.0.7 was already running
during both of these tests.
I am working on creating some more tests to see if the slowdown returns,
but at this point, it seems that the restart of the machine has "fixed" the
issue of slow queries.
-AP_
Very interesting results.
While it _could_ be a memory leak, I wouldn't necessarily jump to that
conclusion. Of course if the performance gets bad and restarting the
server magically fixes it that's a tempting conclusion to embrace, but I
wonder if you noticed another very interesting difference in the logs
Nov 23 20:21:19 ec2-54-175-62-165 mongod.27000[9873]: [conn96865] command
cdc.$cmd command: count { count: "events-qos-loadstart", query: {
vd.ple.vd.acc: "EY", vd.ple.vd.pid: "313c0296-5469-59f7-7cbe-5b818a2e657c",
vd.pec: { $gt: 1.0 } }, fields: {} } planSummary: IXSCAN {
vd.ple.vd.acc: 1, vd.ple.vd.pid: 1, vd.pec: 1, vd.ts: 1 } keyUpdates:0
writeConflicts:0 numYields:*3487* reslen:44 locks:{ Global: {
{ r: 261535 } }, Database: { acquireCount: { r: 3488 } }, Collection: {
acquireCount: { r: 3488 } } } 37515ms
Nov 23 20:43:10 ec2-54-175-62-165 mongod.27000[31167]: [conn47] command
cdc.$cmd command: count { count: "events-qos-loadstart", query: {
vd.ple.vd.acc: "EY", vd.ple.vd.pid: "313c0296-5469-59f7-7cbe-5b818a2e657c",
vd.pec: { $gt: 1.0 } }, fields: {} } planSummary: COUNT_SCAN {
vd.ple.vd.acc: 1, vd.ple.vd.pid: 1, vd.pec: 1, vd.ts: 1 } keyUpdates:0
writeConflicts:0 numYields:*3058* reslen:44 locks:{ Global: {
acquireCount: { r: 6118 }, acquireWaitCount: { r: 9 },
timeAcquiringMicros: { r: 9508 } }, Database: { acquireCount: { r: 3059
} }, Collection: { acquireCount: { r: 3059 } } } 115ms
Nov 23 20:43:17 ec2-54-175-62-165 mongod.27000[31167]: [conn47] command
cdc.$cmd command: count { count: "events-qos-loadstart", query: {
vd.ple.vd.acc: "EY", vd.ple.vd.pid: "313c0296-5469-59f7-7cbe-5b818a2e657c",
vd.pec: { $gt: 1.0 } }, fields: {} } planSummary: COUNT_SCAN {
vd.ple.vd.acc: 1, vd.ple.vd.pid: 1, vd.pec: 1, vd.ts: 1 } keyUpdates:0
writeConflicts:0 numYields:*3058* reslen:44 locks:{ Global: {
acquireCount: { r: 6118 } }, Database: { acquireCount: { r: 3059 } },
Collection: { acquireCount: { r: 3059 } } } 105ms
Two *very* different looking lines, because they seem to output a different plan summary!
So what's going on? I'm going to go out on a limb and guess that maybe
what was running "before" was a different version than what is running
"after". So maybe there _was_ a bug fixed, but it's not about number of
yields as those are about the same...
Would you check the version running now - best way to check is in the
logs - especially in the old log since that version may be harder to track
down!
Asya
You are correct. There ARE writes which are happening, and there could
be quite a bit of them. A LOT. All of them do come from replication so
there are no "direct" client writes. I have about 21 collections in total
and they are all being replicated to, however, only THIS collection is
experiencing relatively "slow" performance times. Yes, there is no such thing as a read-only secondary replica from Mongo's point of view. So,
let's make sure we are on the same page. This is a SECONDARY server
replicating data from the PRIMARY and is only used to run read-only
aggregations.
I took the route of taking the server out of the replication, before
Nov 23 20:18:47 ec2-54-175-62-165 mongod.27000[9873]: [conn96865] command
cdc.$cmd command: count { count: "events-qos-loadstart", query: {
vd.ple.vd.acc: "EY", vd.ple.vd.pid: "313c0296-5469-59f7-7cbe-5b818a2e657c",
1, vd.ple.vd.pid: 1, vd.pec: 1, vd.ts: 1 } keyUpdates:0 writeConflicts:0
numYields:4146 reslen:44 locks:{ Global: { acquireCount: { r: 8294 },
acquireWaitCount: { r: 161 }, timeAcquiringMicros: { r: 437064 } },
4147 } } } 64663ms
Nov 23 20:20:17 ec2-54-175-62-165 mongod.27000[9873]: [conn96865] command
cdc.$cmd command: count { count: "events-qos-loadstart", query: {
vd.ple.vd.acc: "EY", vd.ple.vd.pid: "313c0296-5469-59f7-7cbe-5b818a2e657c",
1, vd.ple.vd.pid: 1, vd.pec: 1, vd.ts: 1 } keyUpdates:0 writeConflicts:0
numYields:3708 reslen:44 locks:{ Global: { acquireCount: { r: 7418 },
acquireWaitCount: { r: 111 }, timeAcquiringMicros: { r: 302599 } },
3709 } } } 47897ms
Nov 23 20:21:19 ec2-54-175-62-165 mongod.27000[9873]: [conn96865] command
cdc.$cmd command: count { count: "events-qos-loadstart", query: {
vd.ple.vd.acc: "EY", vd.ple.vd.pid: "313c0296-5469-59f7-7cbe-5b818a2e657c",
1, vd.ple.vd.pid: 1, vd.pec: 1, vd.ts: 1 } keyUpdates:0 writeConflicts:0
numYields:3487 reslen:44 locks:{ Global: { acquireCount: { r: 6976 },
acquireWaitCount: { r: 112 }, timeAcquiringMicros: { r: 261535 } },
3488 } } } 37515ms
AFTER SHUTTING DOWN THE SERVER AND STARTING WITHOUT REPLICATION (on a
Nov 23 20:30:57 ec2-54-175-62-165 mongod.27001[30855]: [conn3] command
cdc.$cmd command: count { count: "events-qos-loadstart", query: {
vd.ple.vd.acc: "EY", vd.ple.vd.pid: "313c0296-5469-59f7-7cbe-5b818a2e657c",
vd.pec: { $gt: 1.0 } }, fields: {} } planSummary: COUNT_SCAN {
vd.ple.vd.acc: 1, vd.ple.vd.pid: 1, vd.pec: 1, vd.ts: 1 } keyUpdates:0
writeConflicts:0 numYields:3058 reslen:44 locks:{ Global: { acquireCount: {
r: 6118 } }, Database: { acquireCount: { r: 3059 } }, Collection: {
acquireCount: { r: 3059 } } } 150ms
Nov 23 20:31:13 ec2-54-175-62-165 mongod.27001[30855]: [conn3] command
cdc.$cmd command: count { count: "events-qos-loadstart", query: {
vd.ple.vd.acc: "EY", vd.ple.vd.pid: "313c0296-5469-59f7-7cbe-5b818a2e657c",
vd.pec: { $gt: 1.0 } }, fields: {} } planSummary: COUNT_SCAN {
vd.ple.vd.acc: 1, vd.ple.vd.pid: 1, vd.pec: 1, vd.ts: 1 } keyUpdates:0
writeConflicts:0 numYields:3058 reslen:44 locks:{ Global: { acquireCount: {
r: 6118 } }, Database: { acquireCount: { r: 3059 } }, Collection: {
acquireCount: { r: 3059 } } } 104ms
Nov 23 20:31:17 ec2-54-175-62-165 mongod.27001[30855]: [conn3] command
cdc.$cmd command: count { count: "events-qos-loadstart", query: {
vd.ple.vd.acc: "EY", vd.ple.vd.pid: "313c0296-5469-59f7-7cbe-5b818a2e657c",
vd.pec: { $gt: 1.0 } }, fields: {} } planSummary: COUNT_SCAN {
vd.ple.vd.acc: 1, vd.ple.vd.pid: 1, vd.pec: 1, vd.ts: 1 } keyUpdates:0
writeConflicts:0 numYields:3058 reslen:44 locks:{ Global: { acquireCount: {
r: 6118 } }, Database: { acquireCount: { r: 3059 } }, Collection: {
acquireCount: { r: 3059 } } } 107ms
Nov 23 20:31:20 ec2-54-175-62-165 mongod.27001[30855]: [conn3] command
cdc.$cmd command: count { count: "events-qos-loadstart", query: {
vd.ple.vd.acc: "EY", vd.ple.vd.pid: "313c0296-5469-59f7-7cbe-5b818a2e657c",
vd.pec: { $gt: 1.0 } }, fields: {} } planSummary: COUNT_SCAN {
vd.ple.vd.acc: 1, vd.ple.vd.pid: 1, vd.pec: 1, vd.ts: 1 } keyUpdates:0
writeConflicts:0 numYields:3058 reslen:44 locks:{ Global: { acquireCount: {
r: 6118 } }, Database: { acquireCount: { r: 3059 } }, Collection: {
acquireCount: { r: 3059 } } } 104ms
Nov 23 20:43:10 ec2-54-175-62-165 mongod.27000[31167]: [conn47] command
cdc.$cmd command: count { count: "events-qos-loadstart", query: {
vd.ple.vd.acc: "EY", vd.ple.vd.pid: "313c0296-5469-59f7-7cbe-5b818a2e657c",
vd.pec: { $gt: 1.0 } }, fields: {} } planSummary: COUNT_SCAN {
vd.ple.vd.acc: 1, vd.ple.vd.pid: 1, vd.pec: 1, vd.ts: 1 } keyUpdates:0
writeConflicts:0 numYields:3058 reslen:44 locks:{ Global: { acquireCount: {
r: 6118 }, acquireWaitCount: { r: 9 }, timeAcquiringMicros: { r: 9508 } },
3059 } } } 115ms
Nov 23 20:43:14 ec2-54-175-62-165 mongod.27000[31167]: [conn47] command
cdc.$cmd command: count { count: "events-qos-loadstart", query: {
vd.ple.vd.acc: "EY", vd.ple.vd.pid: "313c0296-5469-59f7-7cbe-5b818a2e657c",
vd.pec: { $gt: 1.0 } }, fields: {} } planSummary: COUNT_SCAN {
vd.ple.vd.acc: 1, vd.ple.vd.pid: 1, vd.pec: 1, vd.ts: 1 } keyUpdates:0
writeConflicts:0 numYields:3058 reslen:44 locks:{ Global: { acquireCount: {
r: 6118 } }, Database: { acquireCount: { r: 3059 } }, Collection: {
acquireCount: { r: 3059 } } } 104ms
Nov 23 20:43:17 ec2-54-175-62-165 mongod.27000[31167]: [conn47] command
cdc.$cmd command: count { count: "events-qos-loadstart", query: {
vd.ple.vd.acc: "EY", vd.ple.vd.pid: "313c0296-5469-59f7-7cbe-5b818a2e657c",
vd.pec: { $gt: 1.0 } }, fields: {} } planSummary: COUNT_SCAN {
vd.ple.vd.acc: 1, vd.ple.vd.pid: 1, vd.pec: 1, vd.ts: 1 } keyUpdates:0
writeConflicts:0 numYields:3058 reslen:44 locks:{ Global: { acquireCount: {
r: 6118 } }, Database: { acquireCount: { r: 3059 } }, Collection: {
acquireCount: { r: 3059 } } } 105ms
So, suddenly, things are running quite fast.
We don't have too much memory on this machine (it's a 15 gig box in EC2)...
total used free shared buffers cached
Mem: 15042 6911 8131 0 60 3663
-/+ buffers/cache: 3187 11855
Swap: 0 0 0
Total: 15042 6911 8131
We did notice that the memory on the box was fully utilized (without
going into swap) when things were running slow. After restarting the box,
memory was not fully utilized.
I will monitor the machine some more. Could this be a memory leak?
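Something like the following could help keep an eye on it - a rough sketch, and the field names are the ones the wiredTiger section of serverStatus appears to use on 3.0, so worth double-checking:

// rough sketch: compare WiredTiger cache usage against its configured ceiling
var c = db.serverStatus().wiredTiger.cache;
print("bytes currently in the cache : " + c["bytes currently in the cache"]);
print("maximum bytes configured     : " + c["maximum bytes configured"]);
print("pages read into cache        : " + c["pages read into cache"]);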
-AP_
First, no, I don't think recreating an index is going to do anything.
Second, your comment about read-only seems strange - if there is any
replication that means there are writes.
If there is no writing happening, then there is no replication happening
either, i.e. secondaries are waiting for primary to do some writes so that
they can "repeat" them.
Are you *sure* there are no writes happening? The reason this count is
so slow is because it's yielding. A lot. The question is why. There've
been some bugs fixed that cause a query to yield too much, but none that I
could find were affecting your exact version.
I just realized you said "read-only secondary replica" - there is no
such thing as read-only secondary if it's replicating writes from the
primary.
It has to repeat every single write that the primary does. All of
them. So I suspect there is a lot of writing actually going on (though
it's not clear why it would be yielding so often unless there was some flaw
in the algorithm that decides how often to yield).
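A sketch of one way to watch the yielding live (assuming 3.0 reports the count command in currentOp under op "query" with a query.count field - worth verifying) is to poll currentOp from a second shell while the slow count runs:

// sketch only: while the slow count runs in another shell, print its live yield counter
db.currentOp({ "query.count" : "events-qos-loadstart" }).inprog.forEach(function (op) {
    printjson({ opid : op.opid, secs_running : op.secs_running, numYields : op.numYields });
});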
Btw, you can prove whether the writes are the issue or not by temporarily
stopping replication and then running this same query/count a few times.
Please only do this if you have several secondaries in this replica set, I
don't want you to risk your replica set availability for this. In fact,
this collection is only about 15GB - you could dump it and restore it into
a standalone mongod that really _will_ be read-only and see if the
performance on there is a lot faster (in particular the number of yields is
key). If that's the case then it would be important to determine whether
the system is simply unable to keep up with heavy mixed workload or if
something else is the issue.
Repeat the full experiment as above but first turning on higher log
level, running this count a couple of times waiting maybe 30 seconds
between them, then setting log level back to "normal". This will give you
a section of the log with all operations logged. Send that to
mplotqueries <https://www.google.com/search?q=mplotqueries> and see if
that picture tells you anything interesting (if not, you're welcome to post
it here and we can all take a look).
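For the log-level step, a minimal sketch using the db.setLogLevel() helper in the 3.0 shell:

db.setLogLevel(1);   // at verbosity 1 mongod generally logs every operation, not only slow ones
// ... run the count two or three times here, roughly 30 seconds apart ...
db.setLogLevel(0);   // back to the default verbosity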
Whether or not there is a bug we don't know about in the version you're running causing "too-frequent-yields", there is something in your set-up that's triggering those yields - we don't normally see indexed
counts of collections with <3M records take anywhere near this long.
You also say "other queries run quite fast, within 2 seconds" but 2
seconds is *not* very fast at all! If you're running into a subtle bug,
then the best way to improve your performance would be to figure out what the
bug is so that we can fix it (for everyone, not just you :) ).
Asya
I am reading that timeAcquiringMicros: { r: 16500145 } } represents the
time it took to get the locks. That seems a bit high at 16.5 seconds.
There are no writes being done against this database (other than
replication). Can locking be disabled? Other than replication, this is
essentially a read-only database.
We are using WiredTiger on a 3.0.7 server. What can we try to do to
improve the performance of this read-only secondary replica?
Thanks.
-AP_
Do you think re-creating index will make a difference?
Asya,
Nov 18 15:38:04 ec2-54-175-62-165 *mongod*.27000[9873]: [conn76138]
command cdc.$cmd command: count { count: "events-qos-loadstart", query: {
vd.ple.vd.acc: "EY", vd.ple.vd.pid: "313c0296-5469-59f7-7cbe-5b818a2e657c",
1, vd.ple.vd.pid: 1, vd.pec: 1, vd.ts: 1 } keyUpdates:0 writeConflicts:0
numYields:4539 reslen:44 locks:{ Global: { acquireCount: { r: 9080 },
acquireWaitCount: { r: 715 }, timeAcquiringMicros: { r: 16500145 } },
4540 } } } 87137ms
Nov 18 15:43:26 ec2-54-175-62-165 mongod.27000[9873]: [conn76138] command
cdc.$cmd command: count { count: "events-qos-loadstart", query: {
vd.ple.vd.acc: "EY", vd.ple.vd.pid: "313c0296-5469-59f7-7cbe-5b818a2e657c",
1, vd.ple.vd.pid: 1, vd.pec: 1, vd.ts: 1 } keyUpdates:0 writeConflicts:0
numYields:4541 reslen:44 locks:{ Global: { acquireCount: { r: 9084 },
acquireWaitCount: { r: 730 }, timeAcquiringMicros: { r: 15110480 } },
4542 } } } 87745ms
Nov 18 15:45:49 ec2-54-175-62-165 mongod.27000[9873]: [conn76138] command
cdc.$cmd command: count { count: "events-qos-loadstart", query: {
vd.ple.vd.acc: "EY", vd.ple.vd.pid: "313c0296-5469-59f7-7cbe-5b818a2e657c",
1, vd.ple.vd.pid: 1, vd.pec: 1, vd.ts: 1 } keyUpdates:0 writeConflicts:0
numYields:4539 reslen:44 locks:{ Global: { acquireCount: { r: 9080 },
acquireWaitCount: { r: 919 }, timeAcquiringMicros: { r: 16248981 } },
4540 } } } 86816ms
Nov 18 15:47:34 ec2-54-175-62-165 mongod.27000[9873]: [conn76138] command
cdc.$cmd command: count { count: "events-qos-loadstart", query: {
vd.ple.vd.acc: "EY", vd.ple.vd.pid: "313c0296-5469-59f7-7cbe-5b818a2e657c",
1, vd.ple.vd.pid: 1, vd.pec: 1, vd.ts: 1 } keyUpdates:0 writeConflicts:0
numYields:4267 reslen:44 locks:{ Global: { acquireCount: { r: 8536 },
acquireWaitCount: { r: 622 }, timeAcquiringMicros: { r: 11486260 } },
4268 } } } 74458ms
The count returned is always 390936. So this data is not changing.
A slightly different version of this query, with $eq instead of $gt, returns:
Nov 18 15:51:20 ec2-54-175-62-165 mongod.27000[9873]: [conn76138] command
cdc.$cmd command: count { count: "events-qos-loadstart", query: {
vd.ple.vd.acc: "EY", vd.ple.vd.pid: "313c0296-5469-59f7-7cbe-5b818a2e657c",
1, vd.ple.vd.pid: 1, vd.pec: 1, vd.ts: 1 } keyUpdates:0 writeConflicts:0
numYields:18 reslen:44 locks:{ Global: { acquireCount: { r: 38 },
{ acquireCount: { r: 19 } }, Collection: { acquireCount: { r: 19 } } } 329ms
The result of this count is: 1541
So it seems that doing an index scan is taking a long time?
Thanks for your help.
-AP_
Since the command takes this long, there will be a line for it in the
mongod log - can you include that here please?
It might help if you run the count a couple of times and see if the
results are more or less the same performance wise.
{
"ns" : "cdc.events-qos-loadstart",
"count" : 2800752,
"size" : 16706527988,
"avgObjSize" : 5965,
"storageSize" : 1500979200,
"capped" : false,
"wiredTiger" : {
"metadata" : {
"formatVersion" : 1
},
"allocation_size=4KB,app_metadata=(formatVersion=1),block_allocation=best,block_compressor=zlib,cache_resident=0,checkpoint=(WiredTigerCheckpoint.64831=(addr=\"01e3018c3181e49cbf413ee3018c6a81e46d6e131fe3018c7b81e4ca799905808080e459751fc0e469318fc0\",order=64831,time=1447793646,size=1764864000,write_gen=1230720)),checkpoint_lsn=(10558,52273664),checksum=on,collator=,columns=,dictionary=0,format=btree,huffman_key=,huffman_value=,id=71,internal_item_max=0,internal_key_max=0,internal_key_truncate=,internal_page_max=4KB,key_format=q,key_gap=10,leaf_item_max=0,leaf_key_max=0,leaf_page_max=32KB,leaf_value_max=1MB,memory_page_max=10m,os_cache_dirty_max=0,os_cache_max=0,prefix_compression=0,prefix_compression_min=4,split_deepen_min_child=0,split_deepen_per_child=0,split_pct=90,value_format=u,version=(major=1,minor=1)",
"type" : "file",
"uri" : "statistics:table:collection-8--1012075229251210100",
"LSM" : {
"bloom filters in the LSM tree" : 0,
"bloom filter false positives" : 0,
"bloom filter hits" : 0,
"bloom filter misses" : 0,
"bloom filter pages evicted from cache" : 0,
"bloom filter pages read into cache" : 0,
"total size of bloom filters" : 0,
"sleep for LSM checkpoint throttle" : 0,
"chunks in the LSM tree" : 0,
"highest merge generation in the LSM tree" : 0,
"queries that could have benefited from a Bloom filter that did not
exist" : 0,
"sleep for LSM merge throttle" : 0
},
"block-manager" : {
"file allocation unit size" : 4096,
"blocks allocated" : 581831,
"checkpoint size" : 1764864000,
"allocations requiring file extension" : 4304,
"blocks freed" : 489809,
"file magic number" : 120897,
"file major version number" : 1,
"minor version number" : 0,
"file bytes available for reuse" : 1568768,
"file size in bytes" : 1500979200
},
"btree" : {
"btree checkpoint generation" : 25478,
"column-store variable-size deleted values" : 0,
"column-store fixed-size leaf pages" : 0,
"column-store internal pages" : 0,
"column-store variable-size leaf pages" : 0,
"pages rewritten by compaction" : 0,
"number of key/value pairs" : 0,
"fixed-record size" : 0,
"maximum tree depth" : 5,
"maximum internal page key size" : 368,
"maximum internal page size" : 4096,
"maximum leaf page key size" : 3276,
"maximum leaf page size" : 32768,
"maximum leaf page value size" : 1048576,
"overflow pages" : 0,
"row-store internal pages" : 0,
"row-store leaf pages" : 0
},
"cache" : {
"bytes read into cache" : NumberLong("2843839082589"),
"bytes written from cache" : 65522296650,
"checkpoint blocked page eviction" : 4,
"unmodified pages evicted" : 11530699,
"page split during eviction deepened the tree" : 0,
"modified pages evicted" : 424472,
"data source pages selected for eviction unable to be evicted" : 77762,
"hazard pointer blocked page eviction" : 20840,
"internal pages evicted" : 50252,
"pages split during eviction" : 28366,
"in-memory page splits" : 0,
"overflow values cached in memory" : 0,
"pages read into cache" : 12062318,
"overflow pages read into cache" : 0,
"pages written from cache" : 540301
},
"compression" : {
"raw compression call failed, no additional data available" : 73292,
"raw compression call failed, additional data available" : 21316,
"raw compression call succeeded" : 495946,
"compressed pages read" : 12047447,
"compressed pages written" : 9749,
"page written failed to compress" : 0,
"page written was too small to compress" : 63543
},
"cursor" : {
"create calls" : 8236,
"insert calls" : 1337876,
"bulk-loaded cursor-insert calls" : 0,
"cursor-insert key and value bytes inserted" : 6266710027,
"next calls" : 102,
"prev calls" : 1,
"remove calls" : 0,
"cursor-remove key bytes removed" : 0,
"reset calls" : 304156030,
"search calls" : 308636727,
"search near calls" : 0,
"update calls" : 0,
"cursor-update value bytes updated" : 0
},
"reconciliation" : {
"dictionary matches" : 0,
"internal page multi-block writes" : 5637,
"leaf page multi-block writes" : 30643,
"maximum blocks required for a page" : 0,
"internal-page overflow keys" : 0,
"leaf-page overflow keys" : 0,
"overflow values written" : 0,
"pages deleted" : 0,
"page checksum matches" : 7621,
"page reconciliation calls" : 523383,
"page reconciliation calls for eviction" : 455853,
"leaf page key bytes discarded using prefix compression" : 0,
"internal page key bytes discarded using suffix compression" : 0
},
"session" : {
"object compaction" : 0,
"open cursor count" : 8236
},
"transaction" : {
"update conflicts" : 0
}
},
"nindexes" : 4,
"totalIndexSize" : 288075776,
"indexSizes" : {
"_id_" : 166514688,
"vd.ple.vd.acc_1_vd.ts_1_vd.pec_1" : 39645184,
"vd.ts_1_vd.pec_1" : 39198720,
"vd.ple.vd.acc_1_vd.ple.vd.pid_1_vd.pec_1_vd.ts_1" : 42717184
},
"ok" : 1
}
The execution plan query came back very fast. The count query sits there
for at least 45 to 50 seconds. We are using MongoDB 3.0.7 with WiredTiger.
{
"db" : "cdc",
"collections" : 32,
"objects" : 409884155,
"avgObjSize" : 3860.0832105378654,
"dataSize" : 1582186944981,
"storageSize" : 153111339008,
"numExtents" : 0,
"indexes" : 131,
"indexSize" : 503011
Alex Paransky
2016-01-08 19:09:54 UTC
Permalink
I think there is something strange going on. The explain shows that the
winning plan is going to use COUNT_SCAN, but when I actually run the
command it executes IXSCAN.

db.getCollection('events-qos-loadstart').explain().count({"vd.ple.vd.pid":
"57de9139-cc7e-4b5c-51fd-aaa8517028f0","vd.ple.vd.acc": "EY"})


{

"queryPlanner" : {

"plannerVersion" : 1,

"namespace" : "cdc.events-qos-loadstart",

"indexFilterSet" : false,

"parsedQuery" : {

"$and" : [

{

"vd.ple.vd.acc" : {

"$eq" : "EY"

}

},

{

"vd.ple.vd.pid" : {

"$eq" : "57de9139-cc7e-4b5c-51fd-aaa8517028f0"

}

}

]

},

"winningPlan" : {

"stage" : "COUNT",

"inputStage" : {

"stage" : "COUNT_SCAN",

"keyPattern" : {

"vd.ple.vd.acc" : 1,

"vd.ple.vd.pid" : 1,

"vd.pec" : 1,

"vd.ts" : 1

},

"indexName" : "vd.ple.vd.acc_1_vd.ple.vd.pid_1_vd.pec_1_vd.ts_1",

"isMultiKey" : false

}

},

"rejectedPlans" : [ ]

},

"serverInfo" : {

"host" : "ec2-52-23-180-212",

"port" : 27000,

"version" : "3.0.8",

"gitVersion" : "83d8cc25e00e42856924d84e220fbe4a839e605d"

},

"ok" : 1

}

Yet, when I actually execute the statement:

db.getCollection('events-qos-loadstart').count({"vd.ple.vd.pid":
"57de9139-cc7e-4b5c-51fd-aaa8517028f0","vd.ple.vd.acc": "EY"})


Jan 8 19:06:13 ec2-52-23-180-212 mongod.27000[25120]: [conn4144] command
cdc.$cmd command: count { count: "events-qos-loadstart", query: {
vd.ple.vd.pid: "57de9139-cc7e-4b5c-51fd-aaa8517028f0", vd.ple.vd.acc: "EY"
}, fields: {} } planSummary: IXSCAN { vd.ple.vd.acc: 1, vd.ple.vd.pid: 1,
vd.pec: 1, vd.ts: 1 } keyUpdates:0 writeConflicts:0 numYields:2693
reslen:44 locks:{ Global: { acquireCount: { r: 5388 }, acquireWaitCount: {
r: 300 }, timeAcquiringMicros: { r: 927544 } }, Database: { acquireCount: {
r: 2694 } }, Collection: { acquireCount: { r: 2694 } } } 45817ms


I don't know what's causing this at this point. I am running the latest
production version of MongoDB (3.0.8).


-AP_
Post by Alex Paransky
It seems that we are once again using IXSCAN instead of COUNT_SCAN, why?
-AP_
Asya, I am back to this issue. We have since updated to 3.0.8.
Jan 8 17:07:56 ### mongod.27000[25120]: [conn4087] command cdc.$cmd
"57de9139-cc7e-4b5c-51fd-aaa8517028f0", vd.ple.vd.acc: "EY" } }
{ acquireCount: { r: 4318 }, acquireWaitCount: { r: 422 },
timeAcquiringMicros: { r: 3075915 } }, Database: { acquireCount: { r: 2159
} }, Collection: { acquireCount: { r: 2159 } } } 35693ms
The count returns only 197,855
However, timeAcquiringMicros seems a bit high. Still using WiredTiger.
Last time we did not return to this issue as after the restart things were
running quite happily and fast. Now, we are back to these issues.
What can I do to diagnose some more?
Thanks.
-AP_
Interesting observation. Our logs are being written to syslog, and our
servers typically run for a while. So, I was not able to "definitively"
find out which version of the server was running when the IXSCAN appeared
in the explain plan, however, from MongoDB Cloud Manager I see that this
machine was updated to version 3.0.7 (from 3.0.4 previous version) on
10/20/15 - 10:35:11.
So based on the timestamp of Nov 23, version 3.0.7 was already running
during both of these tests.
I am working on creating some more tests to see if the slowdown returns,
but at this point, it seems that the restart of the machine has "fixed" the
issue of slow queries.
-AP_
Very interesting results.
While it _could_ be a memory leak, I wouldn't necessarily jump to that
conclusion. Of course if the performance gets bad and restarting the
server magically fixes it that's a tempting conclusion to embrace, but I
wonder if you noticed another very interesting difference in the logs
Nov 23 20:21:19 ec2-54-175-62-165 mongod.27000[9873]: [conn96865] command
cdc.$cmd command: count { count: "events-qos-loadstart", query: {
vd.ple.vd.acc: "EY", vd.ple.vd.pid: "313c0296-5469-59f7-7cbe-5b818a2e657c",
1, vd.ple.vd.pid: 1, vd.pec: 1, vd.ts: 1 } keyUpdates:0 writeConflicts:0
numYields:*3487* reslen:44 locks:{ Global: { acquireCount: { r: 6976 },
acquireWaitCount: { r: 112 }, timeAcquiringMicros: { r: 261535 } },
3488 } } } 37515ms
Nov 23 20:43:10 ec2-54-175-62-165 mongod.27000[31167]: [conn47] command
cdc.$cmd command: count { count: "events-qos-loadstart", query: {
vd.ple.vd.acc: "EY", vd.ple.vd.pid: "313c0296-5469-59f7-7cbe-5b818a2e657c",
vd.pec: { $gt: 1.0 } }, fields: {} } planSummary: COUNT_SCAN {
vd.ple.vd.acc: 1, vd.ple.vd.pid: 1, vd.pec: 1, vd.ts: 1 } keyUpdates:0
writeConflicts:0 numYields:*3058* reslen:44 locks:{ Global: {
{ r: 9508 } }, Database: { acquireCount: { r: 3059 } }, Collection: {
acquireCount: { r: 3059 } } } 115ms
Nov 23 20:43:17 ec2-54-175-62-165 mongod.27000[31167]: [conn47] command
cdc.$cmd command: count { count: "events-qos-loadstart", query: {
vd.ple.vd.acc: "EY", vd.ple.vd.pid: "313c0296-5469-59f7-7cbe-5b818a2e657c",
vd.pec: { $gt: 1.0 } }, fields: {} } planSummary: COUNT_SCAN {
vd.ple.vd.acc: 1, vd.ple.vd.pid: 1, vd.pec: 1, vd.ts: 1 } keyUpdates:0
writeConflicts:0 numYields:*3058* reslen:44 locks:{ Global: {
acquireCount: { r: 6118 } }, Database: { acquireCount: { r: 3059 } },
Collection: { acquireCount: { r: 3059 } } } 105ms
Two *very* different looking lines, because they seem to output a different plan summary!
So what's going on? I'm going to go out on a limb and guess that maybe
what was running "before" was a different version than what is running
"after". So maybe there _was_ a bug fixed, but it's not about number of
yields as those are about the same...
Would you check the version running now - best way to check is in the logs
- especially in the old log since that version may be harder to track down!
Asya
You are correct. There ARE writes which are happening, and there could be
quite a bit of them. A LOT. All of them do come from replication so there
are no "direct" client writes. I have about 21 collections in total and
they are all being replicated to, however, only THIS collection is
experiencing relatively "slow" performance times. Yes, there is no such thing as a read-only secondary replica from Mongo's point of view. So,
let's make sure we are on the same page. This is a SECONDARY server
replicating data from the PRIMARY and is only used to run read-only
aggregations.
I took the route of taking the server out of the replication, before doing
Nov 23 20:18:47 ec2-54-175-62-165 mongod.27000[9873]: [conn96865] command
cdc.$cmd command: count { count: "events-qos-loadstart", query: {
vd.ple.vd.acc: "EY", vd.ple.vd.pid: "313c0296-5469-59f7-7cbe-5b818a2e657c",
1, vd.ple.vd.pid: 1, vd.pec: 1, vd.ts: 1 } keyUpdates:0 writeConflicts:0
numYields:4146 reslen:44 locks:{ Global: { acquireCount: { r: 8294 },
acquireWaitCount: { r: 161 }, timeAcquiringMicros: { r: 437064 } },
4147 } } } 64663ms
Nov 23 20:20:17 ec2-54-175-62-165 mongod.27000[9873]: [conn96865] command
cdc.$cmd command: count { count: "events-qos-loadstart", query: {
vd.ple.vd.acc: "EY", vd.ple.vd.pid: "313c0296-5469-59f7-7cbe-5b818a2e657c",
1, vd.ple.vd.pid: 1, vd.pec: 1, vd.ts: 1 } keyUpdates:0 writeConflicts:0
numYields:3708 reslen:44 locks:{ Global: { acquireCount: { r: 7418 },
acquireWaitCount: { r: 111 }, timeAcquiringMicros: { r: 302599 } },
3709 } } } 47897ms
Nov 23 20:21:19 ec2-54-175-62-165 mongod.27000[9873]: [conn96865] command
cdc.$cmd command: count { count: "events-qos-loadstart", query: {
vd.ple.vd.acc: "EY", vd.ple.vd.pid: "313c0296-5469-59f7-7cbe-5b818a2e657c",
1, vd.ple.vd.pid: 1, vd.pec: 1, vd.ts: 1 } keyUpdates:0 writeConflicts:0
numYields:3487 reslen:44 locks:{ Global: { acquireCount: { r: 6976 },
acquireWaitCount: { r: 112 }, timeAcquiringMicros: { r: 261535 } },
3488 } } } 37515ms
AFTER SHUTTING DOWN THE SERVER AND STARTING WITHOUT REPLICATION (on a
Nov 23 20:30:57 ec2-54-175-62-165 mongod.27001[30855]: [conn3] command
cdc.$cmd command: count { count: "events-qos-loadstart", query: {
vd.ple.vd.acc: "EY", vd.ple.vd.pid: "313c0296-5469-59f7-7cbe-5b818a2e657c",
vd.pec: { $gt: 1.0 } }, fields: {} } planSummary: COUNT_SCAN {
vd.ple.vd.acc: 1, vd.ple.vd.pid: 1, vd.pec: 1, vd.ts: 1 } keyUpdates:0
writeConflicts:0 numYields:3058 reslen:44 locks:{ Global: { acquireCount: {
r: 6118 } }, Database: { acquireCount: { r: 3059 } }, Collection: {
acquireCount: { r: 3059 } } } 150ms
Nov 23 20:31:13 ec2-54-175-62-165 mongod.27001[30855]: [conn3] command
cdc.$cmd command: count { count: "events-qos-loadstart", query: {
vd.ple.vd.acc: "EY", vd.ple.vd.pid: "313c0296-5469-59f7-7cbe-5b818a2e657c",
vd.pec: { $gt: 1.0 } }, fields: {} } planSummary: COUNT_SCAN {
vd.ple.vd.acc: 1, vd.ple.vd.pid: 1, vd.pec: 1, vd.ts: 1 } keyUpdates:0
writeConflicts:0 numYields:3058 reslen:44 locks:{ Global: { acquireCount: {
r: 6118 } }, Database: { acquireCount: { r: 3059 } }, Collection: {
acquireCount: { r: 3059 } } } 104ms
Nov 23 20:31:17 ec2-54-175-62-165 mongod.27001[30855]: [conn3] command
cdc.$cmd command: count { count: "events-qos-loadstart", query: {
vd.ple.vd.acc: "EY", vd.ple.vd.pid: "313c0296-5469-59f7-7cbe-5b818a2e657c",
vd.pec: { $gt: 1.0 } }, fields: {} } planSummary: COUNT_SCAN {
vd.ple.vd.acc: 1, vd.ple.vd.pid: 1, vd.pec: 1, vd.ts: 1 } keyUpdates:0
writeConflicts:0 numYields:3058 reslen:44 locks:{ Global: { acquireCount: {
r: 6118 } }, Database: { acquireCount: { r: 3059 } }, Collection: {
acquireCount: { r: 3059 } } } 107ms
Nov 23 20:31:20 ec2-54-175-62-165 mongod.27001[30855]: [conn3] command
cdc.$cmd command: count { count: "events-qos-loadstart", query: {
vd.ple.vd.acc: "EY", vd.ple.vd.pid: "313c0296-5469-59f7-7cbe-5b818a2e657c",
vd.pec: { $gt: 1.0 } }, fields: {} } planSummary: COUNT_SCAN {
vd.ple.vd.acc: 1, vd.ple.vd.pid: 1, vd.pec: 1, vd.ts: 1 } keyUpdates:0
writeConflicts:0 numYields:3058 reslen:44 locks:{ Global: { acquireCount: {
r: 6118 } }, Database: { acquireCount: { r: 3059 } }, Collection: {
acquireCount: { r: 3059 } } } 104ms
Nov 23 20:43:10 ec2-54-175-62-165 mongod.27000[31167]: [conn47] command
cdc.$cmd command: count { count: "events-qos-loadstart", query: {
vd.ple.vd.acc: "EY", vd.ple.vd.pid: "313c0296-5469-59f7-7cbe-5b818a2e657c",
vd.pec: { $gt: 1.0 } }, fields: {} } planSummary: COUNT_SCAN {
vd.ple.vd.acc: 1, vd.ple.vd.pid: 1, vd.pec: 1, vd.ts: 1 } keyUpdates:0
writeConflicts:0 numYields:3058 reslen:44 locks:{ Global: { acquireCount: {
r: 6118 }, acquireWaitCount: { r: 9 }, timeAcquiringMicros: { r: 9508 } },
3059 } } } 115ms
Nov 23 20:43:14 ec2-54-175-62-165 mongod.27000[31167]: [conn47] command
cdc.$cmd command: count { count: "events-qos-loadstart", query: {
vd.ple.vd.acc: "EY", vd.ple.vd.pid: "313c0296-5469-59f7-7cbe-5b818a2e657c",
vd.pec: { $gt: 1.0 } }, fields: {} } planSummary: COUNT_SCAN {
vd.ple.vd.acc: 1, vd.ple.vd.pid: 1, vd.pec: 1, vd.ts: 1 } keyUpdates:0
writeConflicts:0 numYields:3058 reslen:44 locks:{ Global: { acquireCount: {
r: 6118 } }, Database: { acquireCount: { r: 3059 } }, Collection: {
acquireCount: { r: 3059 } } } 104ms
Nov 23 20:43:17 ec2-54-175-62-165 mongod.27000[31167]: [conn47] command
cdc.$cmd command: count { count: "events-qos-loadstart", query: {
vd.ple.vd.acc: "EY", vd.ple.vd.pid: "313c0296-5469-59f7-7cbe-5b818a2e657c",
vd.pec: { $gt: 1.0 } }, fields: {} } planSummary: COUNT_SCAN {
vd.ple.vd.acc: 1, vd.ple.vd.pid: 1, vd.pec: 1, vd.ts: 1 } keyUpdates:0
writeConflicts:0 numYields:3058 reslen:44 locks:{ Global: { acquireCount: {
r: 6118 } }, Database: { acquireCount: { r: 3059 } }, Collection: {
acquireCount: { r: 3059 } } } 105ms
So, suddenly, things are running quite fast.
We don't have too much memory on this machine (it's a 15 gig box in EC2)...
total used free shared buffers cached
Mem: 15042 6911 8131 0 60 3663
-/+ buffers/cache: 3187 11855
Swap: 0 0 0
Total: 15042 6911 8131
We did notice that the memory on the box was fully utilized (without going
into swap) when things were running slow. After restarting the box, memory
was not fully utilized.
I will monitor the machine some more. Could this be a memory leak?
-AP_
First, no, I don't think recreating an index is going to do anything.
Second, your comment about read-only seems strange - if there is any
replication that means there are writes.
If there is no writing happening, then there is no replication happening
either, i.e. secondaries are waiting for primary to do some writes so that
they can "repeat" them.
Are you *sure* there are no writes happening? The reason this count is so
slow is because it's yielding. A lot. The question is why. There've been
some bugs fixed that cause a query to yield too much, but none that I could
find were affecting your exact version.
I just realized you said "read-only secondary replica" - there is no such
thing as read-only secondary if it's replicating writes from the primary.
It has to repeat every single write that the primary does. All of them.
So I suspect there is a lot of writing actually going on (though it's not
clear why it would be yielding so often unless there was some flaw in the
algorithm that decides how often to yield).
Btw, you can prove whether the writes are the issue or not by temporarily
stopping replication and then running this same query/count a few times.
Please only do this if you have several secondaries in this replica set, I
don't want you to risk your replica set availability for this. In fact,
this collection is only about 15GB - you could dump it and restore it into
a standalone mongod that really _will_ be read-only and see if the
performance on there is a lot faster (in particular the number of yields is
key). If that's the case then it would be important to determine whether
the system is simply unable to keep up with heavy mixed workload or if
something else is the issue.
Repeat the full experiment as above but first turning on higher log level,
running this count a couple of times waiting maybe 30 seconds between them,
then setting log level back to "normal". This will give you a section of
the log with all operations logged. Send that to mplotqueries
<https://www.google.com/search?q=mplotqueries> and see if that picture
tells you anything interesting (if not, you're welcome to post it here and
we can all take a look).
Whether or not there is a bug we don't know about in the version you're running causing "too-frequent-yields", there is something in your set-up that's triggering those yields - we don't normally see indexed
counts of collections with <3M records take anywhere near this long.
You also say "other queries run quite fast, within 2 seconds" but 2
seconds is *not* very fast at all! If you're running into a subtle bug,
then the best way to improve your performance would be to figure out what the
bug is so that we can fix it (for everyone, not just you :) ).
Asya
I am reading that timeAcquiringMicros: { r: 16500145 } } represents the
time it took to get the locks. That seems a bit high at 16.5 seconds.
There are no writes being done against this database (other than
replication). Can locking be disabled? Other than replication, this is
essentially a read-only database.
We are using WiredTiger on a 3.0.7 server. What can we try to do to
improve the performance of this read-only secondary replica?
Thanks.
-AP_
Do you think re-creating index will make a difference?
Asya,
Nov 18 15:38:04 ec2-54-175-62-165 *mongod*.27000[9873]: [conn76138]
command cdc.$cmd command: count { count: "events-qos-loadstart", query: {
vd.ple.vd.acc: "EY", vd.ple.vd.pid: "313c0296-5469-59f7-7cbe-5b818a2e657c",
1, vd.ple.vd.pid: 1, vd.pec: 1, vd.ts: 1 } keyUpdates:0 writeConflicts:0
numYields:4539 reslen:44 locks:{ Global: { acquireCount: { r: 9080 },
acquireWaitCount: { r: 715 }, timeAcquiringMicros: { r: 16500145 } },
4540 } } } 87137ms
Nov 18 15:43:26 ec2-54-175-62-165 mongod.27000[9873]: [conn76138] command
cdc.$cmd command: count { count: "events-qos-loadstart", query: {
vd.ple.vd.acc: "EY", vd.ple.vd.pid: "313c0296-5469-59f7-7cbe-5b818a2e657c",
1, vd.ple.vd.pid: 1, vd.pec: 1, vd.ts: 1 } keyUpdates:0 writeConflicts:0
numYields:4541 reslen:44 locks:{ Global: { acquireCount: { r: 9084 },
acquireWaitCount: { r: 730 }, timeAcquiringMicros: { r: 15110480 } },
4542 } } } 87745ms
Nov 18 15:45:49 ec2-54-175-62-165 mongod.27000[9873]: [conn76138] command
cdc.$cmd command: count { count: "events-qos-loadstart", query: {
vd.ple.vd.acc: "EY", vd.ple.vd.pid: "313c0296-5469-59f7-7cbe-5b818a2e657c",
1, vd.ple.vd.pid: 1, vd.pec: 1, vd.ts: 1 } keyUpdates:0 writeConflicts:0
numYields:4539 reslen:44 locks:{ Global: { acquireCount: { r: 9080 },
acquireWaitCount: { r: 919 }, timeAcquiringMicros: { r: 16248981 } },
4540 } } } 86816ms
Nov 18 15:47:34 ec2-54-175-62-165 mongod.27000[9873]: [conn76138] command
cdc.$cmd command: count { count: "events-qos-loadstart", query: {
vd.ple.vd.acc: "EY", vd.ple.vd.pid: "313c0296-5469-59f7-7cbe-5b818a2e657c",
1, vd.ple.vd.pid: 1, vd.pec: 1, vd.ts: 1 } keyUpdates:0 writeConflicts:0
numYields:4267 reslen:44 locks:{ Global: { acquireCount: { r: 8536 },
acquireWaitCount: { r: 622 }, timeAcquiringMicros: { r: 11486260 } },
4268 } } } 74458ms
The count returned is always 390936. So this data is not changing.
A slightly different version of this query, with $eq instead of $gt, returns:
Nov 18 15:51:20 ec2-54-175-62-165 mongod.27000[9873]: [conn76138] command
cdc.$cmd command: count { count: "events-qos-loadstart", query: {
vd.ple.vd.acc: "EY", vd.ple.vd.pid: "313c0296-5469-59f7-7cbe-5b818a2e657c",
1, vd.ple.vd.pid: 1, vd.pec: 1, vd.ts: 1 } keyUpdates:0 writeConflicts:0
numYields:18 reslen:44 locks:{ Global: { acquireCount: { r: 38 },
{ acquireCount: { r: 19 } }, Collection: { acquireCount: { r: 19 } } } 329ms
The result of this count is: 1541
So it seems that doing an index scan is taking a long time?
Thanks for your help.
-AP_
Since the command takes this long, there will be a line for it in the
mongod log - can you include that here please?
It might help if you run the count a couple of times and see if the
results are more or less the same performance wise.
{
"ns" : "cdc.events-qos-loadstart",
"count" : 2800752,
"size" : 16706527988,
"avgObjSize" : 5965,
"storageSize" : 1500979200,
"capped" : false,
"wiredTiger" : {
"metadata" : {
"formatVersion" : 1
},
"allocation_size=4KB,app_metadata=(formatVersion=1),block_allocation=best,block_compressor=zlib,cache_resident=0,checkpoint=(WiredTigerCheckpoint.64831=(addr=\"01e3018c3181e49cbf413ee3018c6a81e46d6e131fe3018c7b81e4ca799905808080e459751fc0e469318fc0\",order=64831,time=1447793646,size=1764864000,write_gen=1230720)),checkpoint_lsn=(10558,52273664),checksum=on,collator=,columns=,dictionary=0,format=btree,huffman_key=,huffman_value=,id=71,internal_item_max=0,internal_key_max=0,internal_key_truncate=,internal_page_max=4KB,key_format=q,key_gap=10,leaf_item_max=0,leaf_key_max=0,leaf_page_max=32KB,leaf_value_max=1MB,memory_page_max=10m,os_cache_dirty_max=0,os_cache_max=0,prefix_compression=0,prefix_compression_min=4,split_deepen_min_child=0,split_deepen_per_child=0,split_pct=90,value_format=u,version=(major=1,minor=1)",
"type" : "file",
"uri" : "statistics:table:collection-8--1012075229251210100",
"LSM" : {
"bloom filters in the LSM tree" : 0,
"bloom filter false positives" : 0,
"bloom filter hits" : 0,
"bloom filter misses" : 0,
"bloom filter pages evicted from cache" : 0,
"bloom filter pages read into cache" : 0,
"total size of bloom filters" : 0,
"sleep for LSM checkpoint throttle" : 0,
"chunks in the LSM tree" : 0,
"highest merge generation in the LSM tree" : 0,
"queries that could have benefited from a Bloom filter that did not exist"
: 0,
"sleep for LSM merge throttle" : 0
},
"block-manager" : {
"file allocation unit size" : 4096,
"blocks allocated" : 581831,
"checkpoint size" : 1764864000,
"allocations requiring file extension" : 4304,
"blocks freed" : 489809,
"file magic number" : 120897,
"file major version number" : 1,
"minor version number" : 0,
"file bytes available for reuse" : 1568768,
"file size in bytes" : 1500979200
},
"btree" : {
"btree checkpoint generation" : 25478,
"column-store variable-size deleted values" : 0,
"column-store fixed-size leaf pages" : 0,
"column-store internal pages" : 0,
"column-store variable-size leaf pages" : 0,
"pages rewritten by compaction" : 0,
"number of key/value pairs" : 0,
"fixed-record size" : 0,
"maximum tree depth" : 5,
"maximum internal page key size" : 368,
"maximum internal page size" : 4096,
"maximum leaf page key size" : 3276,
"maximum leaf page size" : 32768,
"maximum leaf page value size" : 1048576,
"overflow pages" : 0,
"row-store internal pages" : 0,
"row-store leaf pages" : 0
},
"cache" : {
"bytes read into cache" : NumberLong("2843839082589"),
"bytes written from cache" : 65522296650,
"checkpoint blocked page eviction" : 4,
"unmodified pages evicted" : 11530699,
"page split during eviction deepened the tree" : 0,
"modified pages evicted" : 424472,
"data source pages selected for eviction unable to be evicted" : 77762,
"hazard pointer blocked page eviction" : 20840,
"internal pages evicted" : 50252,
"pages split during eviction" : 28366,
"in-memory page splits" : 0,
"overflow values cached in memory" : 0,
"pages read into cache" : 12062318,
"overflow pages read into cache" : 0,
"pages written from cache" : 540301
},
"compression" : {
"raw compression call failed, no additional data available" : 73292,
"raw compression call failed, additional data available" : 21316,
"raw compression call succeeded" : 495946,
"compressed pages read" : 12047447,
"compressed pages written" : 9749,
"page written failed to compress" : 0,
"page written was too small to compress" : 63543
},
"cursor" : {
"create calls" : 8236,
"insert calls" : 1337876,
"bulk-loaded cursor-insert calls" : 0,
"cursor-insert key and value bytes inserted" : 6266710027,
"next calls" : 102,
"prev calls" : 1,
"remove calls" : 0,
"cursor-remove key bytes removed" : 0,
"reset calls" : 304156030,
"search calls" : 308636727,
"search near calls" : 0,
"update calls" : 0,
"cursor-update value bytes updated" : 0
},
"reconciliation" : {
"dictionary matches" : 0,
"internal page multi-block writes" : 5637,
"leaf page multi-block writes" : 30643,
"maximum blocks required for a page" : 0,
"internal-page overflow keys" : 0,
"leaf-page overflow keys" : 0,
"overflow values written" : 0,
"pages deleted" : 0,
"page checksum matches" : 7621,
"page reconciliation calls" : 523383,
"page reconciliation calls for eviction" : 455853,
"leaf page key bytes discarded using prefix compression" : 0,
"internal page key bytes discarded using suffix compression" : 0
},
"session" : {
"object compaction" : 0,
"open cursor count" : 8236
},
"transaction" : {
"update conflicts" : 0
}
},
"nindexes" : 4,
"totalIndexSize" : 288075776,
"indexSizes" : {
"_id_" : 166514688,
"vd.ple.vd.acc_1_vd.ts_1_vd.pec_1" : 39645184,
"vd.ts_1_vd.pec_1" : 39198720,
"vd.ple.vd.acc_1_vd.ple.vd.pid_1_vd.pec_1_vd.ts_1" : 42717184
},
"ok" : 1
}
The execution plan query came back very fast. The count query sits there
for at least 45 to 50 seconds. We are using MongoDB 3.0.7 with WiredTiger.
{
"db" : "cdc",
"collections" : 32,
"objects" : 409884155,
"avgObjSize" : 3860.0832105378654,
"dataSize" : 1582186944981,
"storageSize" : 153111339008,
"nu
...
Alex Paransky
2016-01-08 21:20:55 UTC
Permalink
After restarting the server, the count returns instantaneously.
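Since a restart keeps "fixing" it, it may be worth recording the uptime and connection count whenever the slowdown reappears - a small sketch using serverStatus (field names as I understand them, so worth verifying):

// sketch: note how long the process has been up when the counts get slow again
var s = db.serverStatus();
print("uptime (hours)      : " + (s.uptime / 3600).toFixed(1));
print("current connections : " + s.connections.current);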

-AP_
Post by Alex Paransky
I think there is something strange going on. The explain shows that the
winning plan is going to use COUNT_SCAN, but when I actually run the
command it executes IXSCAN.
"57de9139-cc7e-4b5c-51fd-aaa8517028f0","vd.ple.vd.acc": "EY"})
{
"queryPlanner" : {
"plannerVersion" : 1,
"namespace" : "cdc.events-qos-loadstart",
"indexFilterSet" : false,
"parsedQuery" : {
"$and" : [
{
"vd.ple.vd.acc" : {
"$eq" : "EY"
}
},
{
"vd.ple.vd.pid" : {
"$eq" : "57de9139-cc7e-4b5c-51fd-aaa8517028f0"
}
}
]
},
"winningPlan" : {
"stage" : "COUNT",
"inputStage" : {
"stage" : "COUNT_SCAN",
"keyPattern" : {
"vd.ple.vd.acc" : 1,
"vd.ple.vd.pid" : 1,
"vd.pec" : 1,
"vd.ts" : 1
},
"indexName" : "vd.ple.vd.acc_1_vd.ple.vd.pid_1_vd.pec_1_vd.ts_1",
"isMultiKey" : false
}
},
"rejectedPlans" : [ ]
},
"serverInfo" : {
"host" : "ec2-52-23-180-212",
"port" : 27000,
"version" : "3.0.8",
"gitVersion" : "83d8cc25e00e42856924d84e220fbe4a839e605d"
},
"ok" : 1
}
"57de9139-cc7e-4b5c-51fd-aaa8517028f0","vd.ple.vd.acc": "EY"})
Jan 8 19:06:13 ec2-52-23-180-212 mongod.27000[25120]: [conn4144] command
cdc.$cmd command: count { count: "events-qos-loadstart", query: {
vd.ple.vd.pid: "57de9139-cc7e-4b5c-51fd-aaa8517028f0", vd.ple.vd.acc: "EY"
}, fields: {} } planSummary: IXSCAN { vd.ple.vd.acc: 1, vd.ple.vd.pid: 1,
vd.pec: 1, vd.ts: 1 } keyUpdates:0 writeConflicts:0 numYields:2693
reslen:44 locks:{ Global: { acquireCount: { r: 5388 }, acquireWaitCount: {
r: 300 }, timeAcquiringMicros: { r: 927544 } }, Database: { acquireCount: {
r: 2694 } }, Collection: { acquireCount: { r: 2694 } } } 45817ms
I don't know what's causing this at this point. I am running the latest
production version of MongoDB (3.0.8).
-AP_
It seems that we are once again using IXSCAN instead of COUNT_SCAN, why?
-AP_
Asya, I am back to this issue. We have since updated to 3.0.8.
Jan 8 17:07:56 ### mongod.27000[25120]: [conn4087] command cdc.$cmd
"57de9139-cc7e-4b5c-51fd-aaa8517028f0", vd.ple.vd.acc: "EY" } }
{ acquireCount: { r: 4318 }, acquireWaitCount: { r: 422 },
timeAcquiringMicros: { r: 3075915 } }, Database: { acquireCount: { r: 2159
} }, Collection: { acquireCount: { r: 2159 } } } 35693ms
The count returns only 197,855
However, timeAcquiringMicros seems a bit high. Still using WiredTiger.
Last time we did not return to this issue as after the restart things were
running quite happily and fast. Now, we are back to these issues.
What can I do to diagnose some more?
Thanks.
-AP_
Interesting observation. Our logs are being written to syslog, and our
servers typically run for a while. So, I was not able to "definitively"
find out which version of the server was running when the IXSCAN appeared
in the explain plan, however, from MongoDB Cloud Manager I see that this
machine was updated to version 3.0.7 (from 3.0.4 previous version) on
10/20/15 - 10:35:11.
So based on the timestamp of Nov 23, version 3.0.7 was already running
during both of these tests.
I am working on creating some more tests to see if the slowdown returns,
but at this point, it seems that the restart of the machine has "fixed" the
issue of slow queries.
-AP_
Very interesting results.
While it _could_ be a memory leak, I wouldn't necessarily jump to that
conclusion. Of course if the performance gets bad and restarting the
server magically fixes it that's a tempting conclusion to embrace, but I
wonder if you noticed another very interesting difference in the logs
Nov 23 20:21:19 ec2-54-175-62-165 mongod.27000[9873]: [conn96865] command
cdc.$cmd command: count { count: "events-qos-loadstart", query: {
vd.ple.vd.acc: "EY", vd.ple.vd.pid: "313c0296-5469-59f7-7cbe-5b818a2e657c",
1, vd.ple.vd.pid: 1, vd.pec: 1, vd.ts: 1 } keyUpdates:0 writeConflicts:0
numYields:*3487* reslen:44 locks:{ Global: { acquireCount: { r: 6976 },
acquireWaitCount: { r: 112 }, timeAcquiringMicros: { r: 261535 } },
3488 } } } 37515ms
Nov 23 20:43:10 ec2-54-175-62-165 mongod.27000[31167]: [conn47] command
cdc.$cmd command: count { count: "events-qos-loadstart", query: {
vd.ple.vd.acc: "EY", vd.ple.vd.pid: "313c0296-5469-59f7-7cbe-5b818a2e657c",
vd.pec: { $gt: 1.0 } }, fields: {} } planSummary: COUNT_SCAN {
vd.ple.vd.acc: 1, vd.ple.vd.pid: 1, vd.pec: 1, vd.ts: 1 } keyUpdates:0
writeConflicts:0 numYields:*3058* reslen:44 locks:{ Global: {
{ r: 9508 } }, Database: { acquireCount: { r: 3059 } }, Collection: {
acquireCount: { r: 3059 } } } 115ms
Nov 23 20:43:17 ec2-54-175-62-165 mongod.27000[31167]: [conn47] command
cdc.$cmd command: count { count: "events-qos-loadstart", query: {
vd.ple.vd.acc: "EY", vd.ple.vd.pid: "313c0296-5469-59f7-7cbe-5b818a2e657c",
vd.pec: { $gt: 1.0 } }, fields: {} } planSummary: COUNT_SCAN {
vd.ple.vd.acc: 1, vd.ple.vd.pid: 1, vd.pec: 1, vd.ts: 1 } keyUpdates:0
writeConflicts:0 numYields:*3058* reslen:44 locks:{ Global: {
acquireCount: { r: 6118 } }, Database: { acquireCount: { r: 3059 } },
Collection: { acquireCount: { r: 3059 } } } 105ms
Two *very* different looking lines, because they seem to output a different plan summary!
So what's going on? I'm going to go out on a limb and guess that maybe
what was running "before" was a different version than what is running
"after". So maybe there _was_ a bug fixed, but it's not about number of
yields as those are about the same...
Would you check the version running now - best way to check is in the logs
- especially in the old log since that version may be harder to track down!
Asya
You are correct. There ARE writes which are happening, and there could be
quite a bit of them. A LOT. All of them do come from replication so there
are no "direct" client writes. I have about 21 collections in total and
they are all being replicated to, however, only THIS collection is
experiencing relatively "slow" performance times. Yes, there is no such thing as a read-only secondary replica from Mongo's point of view. So,
let's make sure we are on the same page. This is a SECONDARY server
replicating data from the PRIMARY and is only used to run read-only
aggregations.
I took the route of taking the server out of the replication, before doing
Nov 23 20:18:47 ec2-54-175-62-165 mongod.27000[9873]: [conn96865] command
cdc.$cmd command: count { count: "events-qos-loadstart", query: {
vd.ple.vd.acc: "EY", vd.ple.vd.pid: "313c0296-5469-59f7-7cbe-5b818a2e657c",
1, vd.ple.vd.pid: 1, vd.pec: 1, vd.ts: 1 } keyUpdates:0 writeConflicts:0
numYields:4146 reslen:44 locks:{ Global: { acquireCount: { r: 8294 },
acquireWaitCount: { r: 161 }, timeAcquiringMicros: { r: 437064 } },
4147 } } } 64663ms
Nov 23 20:20:17 ec2-54-175-62-165 mongod.27000[9873]: [conn96865] command
cdc.$cmd command: count { count: "events-qos-loadstart", query: {
vd.ple.vd.acc: "EY", vd.ple.vd.pid: "313c0296-5469-59f7-7cbe-5b818a2e657c",
1, vd.ple.vd.pid: 1, vd.pec: 1, vd.ts: 1 } keyUpdates:0 writeConflicts:0
numYields:3708 reslen:44 locks:{ Global: { acquireCount: { r: 7418 },
acquireWaitCount: { r: 111 }, timeAcquiringMicros: { r: 302599 } },
3709 } } } 47897ms
Nov 23 20:21:19 ec2-54-175-62-165 mongod.27000[9873]: [conn96865] command
cdc.$cmd command: count { count: "events-qos-loadstart", query: {
vd.ple.vd.acc: "EY", vd.ple.vd.pid: "313c0296-5469-59f7-7cbe-5b818a2e657c",
1, vd.ple.vd.pid: 1, vd.pec: 1, vd.ts: 1 } keyUpdates:0 writeConflicts:0
numYields:3487 reslen:44 locks:{ Global: { acquireCount: { r: 6976 },
acquireWaitCount: { r: 112 }, timeAcquiringMicros: { r: 261535 } },
3488 } } } 37515ms
AFTER SHUTTING DOWN THE SERVER AND STARTING WITHOUT REPLICATION (on a
different port, 27001), here are the counts:
Nov 23 20:30:57 ec2-54-175-62-165 mongod.27001[30855]: [conn3] command
cdc.$cmd command: count { count: "events-qos-loadstart", query: {
vd.ple.vd.acc: "EY", vd.ple.vd.pid: "313c0296-5469-59f7-7cbe-5b818a2e657c",
vd.pec: { $gt: 1.0 } }, fields: {} } planSummary: COUNT_SCAN {
vd.ple.vd.acc: 1, vd.ple.vd.pid: 1, vd.pec: 1, vd.ts: 1 } keyUpdates:0
writeConflicts:0 numYields:3058 reslen:44 locks:{ Global: { acquireCount: {
r: 6118 } }, Database: { acquireCount: { r: 3059 } }, Collection: {
acquireCount: { r: 3059 } } } 150ms
Nov 23 20:31:13 ec2-54-175-62-165 mongod.27001[30855]: [conn3] command
cdc.$cmd command: count { count: "events-qos-loadstart", query: {
vd.ple.vd.acc: "EY", vd.ple.vd.pid: "313c0296-5469-59f7-7cbe-5b818a2e657c",
vd.pec: { $gt: 1.0 } }, fields: {} } planSummary: COUNT_SCAN {
vd.ple.vd.acc: 1, vd.ple.vd.pid: 1, vd.pec: 1, vd.ts: 1 } keyUpdates:0
writeConflicts:0 numYields:3058 reslen:44 locks:{ Global: { acquireCount: {
r: 6118 } }, Database: { acquireCount: { r: 3059 } }, Collection: {
acquireCount: { r: 3059 } } } 104ms
Nov 23 20:31:17 ec2-54-175-62-165 mongod.27001[30855]: [conn3] command
cdc.$cmd command: count { count: "events-qos-loadstart", query: {
vd.ple.vd.acc: "EY", vd.ple.vd.pid: "313c0296-5469-59f7-7cbe-5b818a2e657c",
vd.pec: { $gt: 1.0 } }, fields: {} } planSummary: COUNT_SCAN {
vd.ple.vd.acc: 1, vd.ple.vd.pid: 1, vd.pec: 1, vd.ts: 1 } keyUpdates:0
writeConflicts:0 numYields:3058 reslen:44 locks:{ Global: { acquireCount: {
r: 6118 } }, Database: { acquireCount: { r: 3059 } }, Collection: {
acquireCount: { r: 3059 } } } 107ms
Nov 23 20:31:20 ec2-54-175-62-165 mongod.27001[30855]: [conn3] command
cdc.$cmd command: count { count: "events-qos-loadstart", query: {
vd.ple.vd.acc: "EY", vd.ple.vd.pid: "313c0296-5469-59f7-7cbe-5b818a2e657c",
vd.pec: { $gt: 1.0 } }, fields: {} } planSummary: COUNT_SCAN {
vd.ple.vd.acc: 1, vd.ple.vd.pid: 1, vd.pec: 1, vd.ts: 1 } keyUpdates:0
writeConflicts:0 numYields:3058 reslen:44 locks:{ Global: { acquireCount: {
r: 6118 } }, Database: { acquireCount: { r: 3059 } }, Collection: {
acquireCount: { r: 3059 } } } 104ms
Nov 23 20:43:10 ec2-54-175-62-165 mongod.27000[31167]: [conn47] command
cdc.$cmd command: count { count: "events-qos-loadstart", query: {
vd.ple.vd.acc: "EY", vd.ple.vd.pid: "313c0296-5469-59f7-7cbe-5b818a2e657c",
vd.pec: { $gt: 1.0 } }, fields: {} } planSummary: COUNT_SCAN {
vd.ple.vd.acc: 1, vd.ple.vd.pid: 1, vd.pec: 1, vd.ts: 1 } keyUpdates:0
writeConflicts:0 numYields:3058 reslen:44 locks:{ Global: { acquireCount: {
r: 6118 }, acquireWaitCount: { r: 9 }, timeAcquiringMicros: { r: 9508 } },
3059 } } } 115ms
Nov 23 20:43:14 ec2-54-175-62-165 mongod.27000[31167]: [conn47] command
cdc.$cmd command: count { count: "events-qos-loadstart", query: {
vd.ple.vd.acc: "EY", vd.ple.vd.pid: "313c0296-5469-59f7-7cbe-5b818a2e657c",
vd.pec: { $gt: 1.0 } }, fields: {} } planSummary: COUNT_SCAN {
vd.ple.vd.acc: 1, vd.ple.vd.pid: 1, vd.pec: 1, vd.ts: 1 } keyUpdates:0
writeConflicts:0 numYields:3058 reslen:44 locks:{ Global: { acquireCount: {
r: 6118 } }, Database: { acquireCount: { r: 3059 } }, Collection: {
acquireCount: { r: 3059 } } } 104ms
Nov 23 20:43:17 ec2-54-175-62-165 mongod.27000[31167]: [conn47] command
cdc.$cmd command: count { count: "events-qos-loadstart", query: {
vd.ple.vd.acc: "EY", vd.ple.vd.pid: "313c0296-5469-59f7-7cbe-5b818a2e657c",
vd.pec: { $gt: 1.0 } }, fields: {} } planSummary: COUNT_SCAN {
vd.ple.vd.acc: 1, vd.ple.vd.pid: 1, vd.pec: 1, vd.ts: 1 } keyUpdates:0
writeConflicts:0 numYields:3058 reslen:44 locks:{ Global: { acquireCount: {
r: 6118 } }, Database: { acquireCount: { r: 3059 } }, Collection: {
acquireCount: { r: 3059 } } } 105ms
So, suddenly, things are running quite fast.
We don't have too much memory on this machine (it's a 15 gig box in EC2)...
total used free shared buffers cached
Mem: 15042 6911 8131 0 60 3663
-/+ buffers/cache: 3187 11855
Swap: 0 0 0
Total: 15042 6911 8131
We did notice that the memory on the box was fully utilized (without going
into swap) when things were running slow. After restarting the box, memory
was not fully utilized.
I will monitor the machine some more. Could this be a memory leak?
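(One way to watch for that without restarting - a sketch, using the
WiredTiger cache statistics that serverStatus reports on 3.0:)

// In the mongo shell on the slow server:
var wt = db.serverStatus().wiredTiger.cache;
print("WT cache maximum bytes configured : " + wt["maximum bytes configured"]);
print("WT cache bytes currently in cache : " + wt["bytes currently in the cache"]);
printjson(db.serverStatus().mem);           // resident/virtual memory as mongod sees it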
-AP_
First, no, I don't think recreating an index is going to do anything.
Second, your comment about read-only seems strange - if there is any
replication that means there are writes.
If there is no writing happening, then there is no replication happening
either, i.e. secondaries are waiting for primary to do some writes so that
they can "repeat" them.
Are you *sure* there are no writes happening? The reason this count is so
slow is because it's yielding. A lot. The question is why. There've been
some bugs fixed that cause a query to yield too much, but none that I could
find were affecting your exact version.
I just realized you said "read-only secondary replica" - there is no such
thing as read-only secondary if it's replicating writes from the primary.
It has to repeat every single write that the primary does. All of them.
So I suspect there is a lot of writing actually going on (though it's not
clear why it would be yielding so often unless there was some flaw in the
algorithm that decides how often to yield).
Btw, you can prove whether the writes are the issue or not by temporarily
stopping replication and then running this same query/count a few times.
Please only do this if you have several secondaries in this replica set, I
don't want you to risk your replica set availability for this. In fact,
this collection is only about 15GB - you could dump it and restore it into
a standalone mongod that really _will_ be read-only and see if the
performance on there is a lot faster (in particular the number of yields is
key). If that's the case then it would be important to determine whether
the system is simply unable to keep up with heavy mixed workload or if
something else is the issue.
Repeat the full experiment as above, but first turn on a higher log level,
run this count a couple of times waiting maybe 30 seconds between them,
then set the log level back to "normal". This will give you a section of
the log with all operations logged. Send that to mplotqueries
<https://www.google.com/search?q=mplotqueries> and see if that picture
tells you anything interesting (if not, you're welcome to post it here and
we can all take a look).
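(A minimal sketch of the log-level part, using the 3.0 shell helpers -
verbosity 1 is just an example value here:)

// In the mongo shell on the secondary:
db.setLogLevel(1);      // raise verbosity so operations get logged
db.getCollection("events-qos-loadstart").count(
    { "vd.ple.vd.acc": "EY",
      "vd.ple.vd.pid": "313c0296-5469-59f7-7cbe-5b818a2e657c",
      "vd.pec": { $gt: 1 } });
sleep(30 * 1000);       // wait ~30 seconds, then run the count again
db.setLogLevel(0);      // restore the default verbosity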
Whether or not there is a bug we don't know about in the version you're
running that causes "too-frequent" yields, there is something in your
set-up that's triggering those yields - we don't normally see indexed
counts on collections with <3M records take anywhere near this long.
You also say "other queries run quite fast, within 2 seconds", but 2
seconds is *not* very fast at all! If you're running into a subtle bug,
then the best way to improve your performance would be to figure out what
the bug is so that we can fix it (for everyone, not just you :) ).
Asya
I am reading that timeAcquiringMicros: { r: 16500145 } represents the
time it took to acquire the locks. 16.5 seconds seems a bit high.
There are no writes being done against this database (other than
replication). Can locking be disabled? Other than replication, this is
essentially a read-only database.
We are using WiredTiger on 3.0.7 server. What things can we try to do to
improve the performance of this read-only secondary replica?
Thanks.
-AP_
Do you think re-creating the index will make a difference?
Asya,
Nov 18 15:38:04 ec2-54-175-62-165 *mongod*.27000[9873]: [conn76138]
command cdc.$cmd command: count { count: "events-qos-loadstart", query: {
vd.ple.vd.acc: "EY", vd.ple.vd.pid: "313c0296-5469-59f7-7cbe-5b818a2e657c",
1, vd.ple.vd.pid: 1, vd.pec: 1, vd.ts: 1 } keyUpdates:0 writeConflicts:0
numYields:4539 reslen:44 locks:{ Global: { acquireCount: { r: 9080 },
acquireWaitCount: { r: 715 }, timeAcquiringMicros: { r: 16500145 } },
4540 } } } 87137ms
Nov 18 15:43:26 ec2-54-175-62-165 mongod.27000[9873]: [conn76138] command
cdc.$cmd command: count { count: "events-qos-loadstart", query: {
vd.ple.vd.acc: "EY", vd.ple.vd.pid: "313c0296-5469-59f7-7cbe-5b818a2e657c",
1, vd.ple.vd.pid: 1, vd.pec: 1, vd.ts: 1 } keyUpdates:0 writeConflicts:0
numYields:4541 reslen:44 locks:{ Global: { acquireCount: { r: 9084 },
acquireWaitCount: { r: 730 }, timeAcquiringMicros: { r: 15110480 } },
4542 } } } 87745ms
Nov 18 15:45:49 ec2-54-175-62-165 mongod.27000[9873]: [conn76138] command
cdc.$cmd command: count { count: "events-qos-loadstart", query: {
vd.ple.vd.acc: "EY", vd.ple.vd.pid: "313c0296-5469-59f7-7cbe-5b818a2e657c",
1, vd.ple.vd.pid: 1, vd.pec: 1, vd.ts: 1 } keyUpdates:0 writeConflicts:0
numYields:4539 reslen:44 locks:{ Global: { acquireCount: { r: 9080 },
acquireWaitCount: { r: 919 }, timeAcquiringMicros: { r: 16248981 } },
4540 } } } 86816ms
Nov 18 15:47:34 ec2-54-175-62-165 mongod.27000[9873]: [conn76138] command
cdc.$cmd command: count { count: "events-qos-loadstart", query: {
vd.ple.vd.acc: "EY", vd.ple.vd.pid: "313c0296-5469-59f7-7cbe-5b818a2e657c",
1, vd.ple.vd.pid: 1, vd.pec: 1, vd.ts: 1 } keyUpdates:0 writeConflicts:0
numYields:4267 reslen:44 locks:{ Global: { acquireCount: { r: 8536 },
acquireWaitCount: { r: 622 }, timeAcquiringMicros: { r: 11486260 } },
4268 } } } 74458ms
The count returned is always 390936. So this data is not changing.
A slightly different version of this query, with $eq instead of $gt,
returns much faster:
Nov 18 15:51:20 ec2-54-175-62-165 mongod.27000[9873]: [conn76138] command
cdc.$cmd command: count { count: "events-qos-loadstart", query: {
vd.ple.vd.acc: "EY", vd.ple.vd.pid: "313c0296-5469-59f7-7cbe-5b818a2e657c",
1, vd.ple.vd.pid: 1, vd.pec: 1, vd.ts: 1 } keyUpdates:0 writeConflicts:0
numYields:18 reslen:44 locks:{ Global: { acquireCount: { r: 38 },
{ acquireCount: { r: 19 } }, Collection: { acquireCount: { r: 19 } } } 329ms
The result of this count is: 1541
So it seems that doing an index scan is taking a long time?
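(One way to confirm that is to compare totalKeysExamined for the two
predicates - a sketch; the $eq value of 2 below is only a placeholder,
since the exact value queried wasn't shown:)

var coll = db.getCollection("events-qos-loadstart");
// The $gt version (the slow one):
printjson(coll.explain("executionStats").count(
    { "vd.ple.vd.acc": "EY",
      "vd.ple.vd.pid": "313c0296-5469-59f7-7cbe-5b818a2e657c",
      "vd.pec": { $gt: 1 } }).executionStats.totalKeysExamined);
// The $eq version (placeholder value):
printjson(coll.explain("executionStats").count(
    { "vd.ple.vd.acc": "EY",
      "vd.ple.vd.pid": "313c0296-5469-59f7-7cbe-5b818a2e657c",
      "vd.pec": { $eq: 2 } }).executionStats.totalKeysExamined);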
Thanks for your help.
-AP_
Since the command takes this long, there will be a line for it in the
mongod log - can you include that here please?
It might help if you run the count a couple of times and see if the
results are more or less the same performance-wise.
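(A minimal sketch for that, timing the count from the shell a few times:)

// Run the same count repeatedly and print wall-clock timings:
var query = { "vd.ple.vd.acc": "EY",
              "vd.ple.vd.pid": "313c0296-5469-59f7-7cbe-5b818a2e657c",
              "vd.pec": { $gt: 1 } };
for (var i = 1; i <= 5; i++) {
    var start = new Date().getTime();
    var n = db.getCollection("events-qos-loadstart").count(query);
    print("run " + i + ": count=" + n + " in " +
          (new Date().getTime() - start) + "ms");
}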
{
"ns" : "cdc.events-qos-loadstart",
"count" : 2800752,
"size" : 16706527988,
"avgObjSize" : 5965,
"storageSize" : 1500979200,
"capped" : false,
"wiredTiger" : {
"metadata" : {
"formatVersion" : 1
},
"allocation_size=4KB,app_metadata=(formatVersion=1),block_allocation=best,block_compressor=zlib,cache_resident=0,checkpoint=(WiredTigerCheckpoint.64831=(addr=\"01e3018c3181e49cbf413ee3018c6a81e46d6e131fe3018c7b81e4ca799905808080e459751fc0e469318fc0\",order=64831,time=1447793646,size=1764864000,write_gen=1230720)),checkpoint_lsn=(10558,52273664),checksum=on,collator=,columns=,dictionary=0,format=btree,huffman_key=,huffman_value=,id=71,internal_item_max=0,internal_key_max=0,internal_key_truncate=,internal_page_max=4KB,key_format=q,key_gap=10,leaf_item_max=0,leaf_key_max=0,leaf_page_max=32KB,leaf_value_max=1MB,memory_page_max=10m,os_cache_dirty_max=0,os_cache_max=0,prefix_compression=0,prefix_compression_min=4,split_deepen_min_child=0,split_deepen_per_child=0,split_pct=90,value_format=u,version=(major=1,minor=1)",
"type" : "file",
"uri" : "statistics:table:collection-8--1012075229251210100",
"LSM" : {
"bloom filters in the LSM tree" : 0,
"bloom filter false positives" : 0,
"bloom filter hits" : 0,
"bloom filter misses" : 0,
"bloom filter pages evicted from cache" : 0,
"bloom filter pages read into cache" : 0,
"total size of bloom filters" : 0,
"sleep for LSM checkpoint throttle" : 0,
"chunks in the LSM tree" : 0,
"highest merge generation in the LSM tree" : 0,
"queries that could have benefited from a Bloom filter that did not exist"
: 0,
"sleep for LSM merge throttle" : 0
},
"block-manager" : {
"file allocation unit size" : 4096,
"blocks allocated" : 581831,
"checkpoint size" : 1764864000,
"allocations requiring file extension" : 4304,
"blocks freed" : 489809,
"file magic number" : 120897,
"file major version number" : 1,
"minor version number" : 0,
"file bytes available for reuse" : 1568768,
"file size in bytes" : 1500979200
},
"btree" : {
"btree checkpoint generation" : 25478,
"column-store variable-size deleted values" : 0,
"column-store fixed-size leaf pages" : 0,
"column-store internal pages" : 0,
"column-store variable-size leaf pages" : 0,
"pages rewritten by compaction" : 0,
"number of key/value pairs" : 0,
"fixed-record size" : 0,
"maximum tree depth" : 5,
"maximum internal page key size" : 368,
"maximum internal page size" : 4096,
"maximum leaf page key size" : 3276,
"maximum leaf page size" : 32768,
"maximum leaf page value size" : 1048576,
"overflow pages" : 0,
"row-store internal pages" : 0,
"row-store leaf pages" : 0
},
"cache" : {
"bytes read into cache" : NumberLong("2843839082589"),
"bytes written from cache" : 65522296650,
"checkpoint blocked page eviction" : 4,
"unmodified pages evicted" : 11530699,
"page split during eviction deepened the tree" : 0,
"modified pages evicted" : 424472,
"data source pages selected for eviction unable to be evicted" : 77762,
"hazard pointer blocked page eviction" : 20840,
...
Alex Paransky
2016-01-11 18:46:06 UTC
Permalink
I created a bug, https://jira.mongodb.org/browse/SERVER-22133, to track
this issue.
Post by Alex Paransky
After restarting the server, the count returns instantaneously.
-AP_
Post by Alex Paransky
I think there is something strange going on. The explain shows that the
winning plan is going to use COUNT_SCAN, but when I actually run the
command it executes IXSCAN.
db.getCollection('events-qos-loadstart').explain().count({"vd.ple.vd.pid"
: "57de9139-cc7e-4b5c-51fd-aaa8517028f0","vd.ple.vd.acc": "EY"})
{
"queryPlanner" : {
"plannerVersion" : 1,
"namespace" : "cdc.events-qos-loadstart",
"indexFilterSet" : false,
"parsedQuery" : {
"$and" : [
{
"vd.ple.vd.acc" : {
"$eq" : "EY"
}
},
{
"vd.ple.vd.pid" : {
"$eq" : "57de9139-cc7e-4b5c-51fd-aaa8517028f0"
}
}
]
},
"winningPlan" : {
"stage" : "COUNT",
"inputStage" : {
"stage" : "COUNT_SCAN",
"keyPattern" : {
"vd.ple.vd.acc" : 1,
"vd.ple.vd.pid" : 1,
"vd.pec" : 1,
"vd.ts" : 1
},
"indexName" : "vd.ple.vd.acc_1_vd.ple.vd.pid_1_vd.pec_1_vd.ts_1",
"isMultiKey" : false
}
},
"rejectedPlans" : [ ]
},
"serverInfo" : {
"host" : "ec2-52-23-180-212",
"port" : 27000,
"version" : "3.0.8",
"gitVersion" : "83d8cc25e00e42856924d84e220fbe4a839e605d"
},
"ok" : 1
}
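(Worth noting: explain() with no arguments only reports the planner's
winning plan at the default "queryPlanner" verbosity and does not execute
the count. A sketch that actually runs it and reports what was executed:)

db.getCollection('events-qos-loadstart')
  .explain("executionStats")
  .count({ "vd.ple.vd.pid": "57de9139-cc7e-4b5c-51fd-aaa8517028f0",
           "vd.ple.vd.acc": "EY" })
// Compare executionStats.executionStages (and totalKeysExamined /
// totalDocsExamined) with the planSummary the server logs for the real count.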
"57de9139-cc7e-4b5c-51fd-aaa8517028f0","vd.ple.vd.acc": "EY"})
Jan 8 19:06:13 ec2-52-23-180-212 mongod.27000[25120]: [conn4144] command
cdc.$cmd command: count { count: "events-qos-loadstart", query: {
vd.ple.vd.pid: "57de9139-cc7e-4b5c-51fd-aaa8517028f0", vd.ple.vd.acc: "EY"
}, fields: {} } planSummary: IXSCAN { vd.ple.vd.acc: 1, vd.ple.vd.pid: 1,
vd.pec: 1, vd.ts: 1 } keyUpdates:0 writeConflicts:0 numYields:2693
reslen:44 locks:{ Global: { acquireCount: { r: 5388 }, acquireWaitCount: {
r: 300 }, timeAcquiringMicros: { r: 927544 } }, Database: { acquireCount: {
r: 2694 } }, Collection: { acquireCount: { r: 2694 } } } 45817ms
I don't know what's causing this at this point. I am running the latest
production version of MongoDB (3.0.8).
-AP_
It seems that we are once again using IXSCAN instead of COUNT_SCAN, why?
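(One diagnostic worth trying - assuming the count command's hint option
behaves the same on 3.0.8 - is to force the index explicitly and see
whether the timing changes; this only narrows down whether plan selection
is the problem, it is not a fix:)

db.runCommand({
    count: "events-qos-loadstart",
    query: { "vd.ple.vd.pid": "57de9139-cc7e-4b5c-51fd-aaa8517028f0",
             "vd.ple.vd.acc": "EY" },
    hint: "vd.ple.vd.acc_1_vd.ple.vd.pid_1_vd.pec_1_vd.ts_1"
})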
-AP_
Asya, I am back to this issue. We have since updated to 3.0.8.
Jan 8 17:07:56 ### mongod.27000[25120]: [conn4087] command cdc.$cmd
"57de9139-cc7e-4b5c-51fd-aaa8517028f0", vd.ple.vd.acc: "EY" } }
{ acquireCount: { r: 4318 }, acquireWaitCount: { r: 422 },
timeAcquiringMicros: { r: 3075915 } }, Database: { acquireCount: { r: 2159
} }, Collection: { acquireCount: { r: 2159 } } } 35693ms
The count returns only 197,855.
However, timeAcquiringMicros seems a bit high. We are still using
WiredTiger.
Last time we did not return to this issue because, after the restart,
things were running quite happily and fast. Now we are back to these
issues.
What can I do to diagnose some more?
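(A minimal sketch of two things to capture while the slow count is running
- the globalLock queues and the concurrent operations - field names as
reported on 3.0:)

// In a second shell, while the count is running slowly:
printjson(db.serverStatus().globalLock);    // look at currentQueue and activeClients
db.currentOp(true).inprog.forEach(function (op) {
    if (op.secs_running >= 1) {
        print(op.opid + " " + op.op + " " + op.ns + " " + op.secs_running +
              "s waitingForLock=" + op.waitingForLock);
    }
});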
Thanks.
-AP_
Interesting observation. Our logs are being written to syslog, and our
servers typically run for a while. So I was not able to "definitively"
find out which version of the server was running when the IXSCAN appeared
in the plan summary; however, from MongoDB Cloud Manager I see that this
machine was updated to version 3.0.7 (from the previous version, 3.0.4) on
10/20/15 - 10:35:11.
So, based on the timestamp of Nov 23, version 3.0.7 was already running
during both of these tests.
I am working on creating some more tests to see if the slowdown returns,
but at this point, it seems that the restart of the machine has "fixed" the
issue of slow queries.
-AP_