Hi there, today I tried out the suggestions you gave me.
I had around 20 fields in the full data set named bidding_data. Subsequently, I extracted 3 fields and moved them to a new collection named new_collection.
These are the 3 fields.
{
"_id". // This is my ObjectId.
"terms" // This is the field where I manually segmented and stored the data with spaces. He appears as follows: "ChineseWord1 ChineseWord2 ChineseWord3"
"publishTime" // This is my publication time. I wish to utilize it for data filtering.
}
I created a full-text index as follows, which is identical to the one previously created in the bidding_data.
db.collection.createIndex({ terms: "text", publicTime: -1 })
Next, I have my bidding_data collection and the new_collection collection, along with the storage space they take up.
The storage size of my bidding_data collection
{
"Number of documents" : "176,631",
"Total size in memory" : "4.63 GB (4,973,306,470)",
"Average object size" : "27.50 KB (28,156)",
"Storage size" : "986.11 MB (1,034,014,720)",
"Total index size" : "1008.13 MB (1,057,099,776)"
}
The storage size of my new_collection collection
{
"Number of documents" : "176,631",
"Total size in memory" : "33.24 MB (34,854,347)",
"Average object size" : "197 bytes (197)",
"Storage size" : "16.39 MB (17,190,912)",
"Total index size" : "54.67 MB (57,327,616)"
}
From the above, it can be seen that the size of this new collection has been reduced from the original collection’s 986MB to 16MB, a significant difference of 60 times. Following your advice, I used the $lookup search, and my execution statement is as follows.
db.new_collection.aggregate([
{
$match: {
"$text": { "$language": "none", "$search": "公司 有限公司 投标 招标" },
"publishtime": { "$gte": ISODate("2021-01-12T00:00:00Z"), "$lte": ISODate("2024-01-16T23:59:59Z") }
}
},
{
$sort: {
"score": { "$meta": "textScore" },
"publishtime": -1
}
},
{
$skip: 0
},
{
$limit: 10
},
{
$project: {
"_id": 1
}
},
{
$lookup: {
from: "bidding_data",
localField: "_id",
foreignField: "_id",
as: "bidding_data"
}
}
])
The result was achieved in only 0.4 seconds.
His plan of action is as follows.
// 1
{
"stages": [
{
"$cursor": {
"queryPlanner": {
"plannerVersion": NumberInt("1"),
"namespace": "bidding_data.new_collection",
"indexFilterSet": false,
"parsedQuery": {
"$and": [
{
"publishtime": {
"$lte": ISODate("2024-01-16T23:59:59.000Z")
}
},
{
"publishtime": {
"$gte": ISODate("2021-01-12T00:00:00.000Z")
}
},
{
"$text": {
"$search": "公司 有限公司 投标 招标",
"$language": "none",
"$caseSensitive": false,
"$diacriticSensitive": false
}
}
]
},
"queryHash": "DEF90E69",
"planCacheKey": "E8972E79",
"winningPlan": {
"stage": "PROJECTION_SIMPLE",
"transformBy": {
"_id": true
},
"inputStage": {
"stage": "SORT",
"sortPattern": {
"$computed0": {
"$meta": "textScore"
},
"publishtime": NumberInt("-1")
},
"memLimit": NumberInt("104857600"),
"limitAmount": NumberInt("10"),
"type": "default",
"inputStage": {
"stage": "TEXT",
"indexPrefix": { },
"indexName": "text_index",
"parsedTextQuery": {
"terms": [
"公司",
"投标",
"招标",
"有限公司"
],
"negatedTerms": [ ],
"phrases": [ ],
"negatedPhrases": [ ]
},
"textIndexVersion": NumberInt("3"),
"inputStage": {
"stage": "TEXT_MATCH",
"inputStage": {
"stage": "TEXT_OR",
"filter": {
"$and": [
{
"publishtime": {
"$lte": ISODate("2024-01-16T23:59:59.000Z")
}
},
{
"publishtime": {
"$gte": ISODate("2021-01-12T00:00:00.000Z")
}
}
]
},
"inputStages": [
{
"stage": "IXSCAN",
"keyPattern": {
"_fts": "text",
"_ftsx": NumberInt("1"),
"publishtime": NumberInt("-1")
},
"indexName": "text_index",
"isMultiKey": true,
"isUnique": false,
"isSparse": false,
"isPartial": false,
"indexVersion": NumberInt("2"),
"direction": "backward",
"indexBounds": { }
},
{
"stage": "IXSCAN",
"keyPattern": {
"_fts": "text",
"_ftsx": NumberInt("1"),
"publishtime": NumberInt("-1")
},
"indexName": "text_index",
"isMultiKey": true,
"isUnique": false,
"isSparse": false,
"isPartial": false,
"indexVersion": NumberInt("2"),
"direction": "backward",
"indexBounds": { }
},
{
"stage": "IXSCAN",
"keyPattern": {
"_fts": "text",
"_ftsx": NumberInt("1"),
"publishtime": NumberInt("-1")
},
"indexName": "text_index",
"isMultiKey": true,
"isUnique": false,
"isSparse": false,
"isPartial": false,
"indexVersion": NumberInt("2"),
"direction": "backward",
"indexBounds": { }
},
{
"stage": "IXSCAN",
"keyPattern": {
"_fts": "text",
"_ftsx": NumberInt("1"),
"publishtime": NumberInt("-1")
},
"indexName": "text_index",
"isMultiKey": true,
"isUnique": false,
"isSparse": false,
"isPartial": false,
"indexVersion": NumberInt("2"),
"direction": "backward",
"indexBounds": { }
}
]
}
}
}
}
},
"rejectedPlans": [ ]
},
"executionStats": {
"executionSuccess": true,
"nReturned": NumberInt("10"),
"executionTimeMillis": NumberInt("458"),
"totalKeysExamined": NumberInt("102154"),
"totalDocsExamined": NumberInt("62355"),
"executionStages": {
"stage": "PROJECTION_SIMPLE",
"nReturned": NumberInt("10"),
"executionTimeMillisEstimate": NumberInt("107"),
"works": NumberInt("169617"),
"advanced": NumberInt("10"),
"needTime": NumberInt("169606"),
"needYield": NumberInt("0"),
"saveState": NumberInt("170"),
"restoreState": NumberInt("170"),
"isEOF": NumberInt("1"),
"transformBy": {
"_id": true
},
"inputStage": {
"stage": "SORT",
"nReturned": NumberInt("10"),
"executionTimeMillisEstimate": NumberInt("107"),
"works": NumberInt("169617"),
"advanced": NumberInt("10"),
"needTime": NumberInt("169606"),
"needYield": NumberInt("0"),
"saveState": NumberInt("170"),
"restoreState": NumberInt("170"),
"isEOF": NumberInt("1"),
"sortPattern": {
"$computed0": {
"$meta": "textScore"
},
"publishtime": NumberInt("-1")
},
"memLimit": NumberInt("104857600"),
"limitAmount": NumberInt("10"),
"type": "default",
"totalDataSizeSorted": NumberInt("19739136"),
"usedDisk": false,
"inputStage": {
"stage": "TEXT",
"nReturned": NumberInt("62355"),
"executionTimeMillisEstimate": NumberInt("95"),
"works": NumberInt("169606"),
"advanced": NumberInt("62355"),
"needTime": NumberInt("107250"),
"needYield": NumberInt("0"),
"saveState": NumberInt("170"),
"restoreState": NumberInt("170"),
"isEOF": NumberInt("1"),
"indexPrefix": { },
"indexName": "text_index",
"parsedTextQuery": {
"terms": [
"公司",
"投标",
"招标",
"有限公司"
],
"negatedTerms": [ ],
"phrases": [ ],
"negatedPhrases": [ ]
},
"textIndexVersion": NumberInt("3"),
"inputStage": {
"stage": "TEXT_MATCH",
"nReturned": NumberInt("62355"),
"executionTimeMillisEstimate": NumberInt("94"),
"works": NumberInt("169606"),
"advanced": NumberInt("62355"),
"needTime": NumberInt("107250"),
"needYield": NumberInt("0"),
"saveState": NumberInt("170"),
"restoreState": NumberInt("170"),
"isEOF": NumberInt("1"),
"docsRejected": NumberInt("0"),
"inputStage": {
"stage": "TEXT_OR",
"filter": {
"$and": [
{
"publishtime": {
"$lte": ISODate("2024-01-16T23:59:59.000Z")
}
},
{
"publishtime": {
"$gte": ISODate("2021-01-12T00:00:00.000Z")
}
}
]
},
"nReturned": NumberInt("62355"),
"executionTimeMillisEstimate": NumberInt("94"),
"works": NumberInt("169606"),
"advanced": NumberInt("62355"),
"needTime": NumberInt("107250"),
"needYield": NumberInt("0"),
"saveState": NumberInt("170"),
"restoreState": NumberInt("170"),
"isEOF": NumberInt("1"),
"docsExamined": NumberInt("62355"),
"inputStages": [
{
"stage": "IXSCAN",
"nReturned": NumberInt("31193"),
"executionTimeMillisEstimate": NumberInt("2"),
"works": NumberInt("31194"),
"advanced": NumberInt("31193"),
"needTime": NumberInt("0"),
"needYield": NumberInt("0"),
"saveState": NumberInt("170"),
"restoreState": NumberInt("170"),
"isEOF": NumberInt("1"),
"keyPattern": {
"_fts": "text",
"_ftsx": NumberInt("1"),
"publishtime": NumberInt("-1")
},
"indexName": "text_index",
"isMultiKey": true,
"isUnique": false,
"isSparse": false,
"isPartial": false,
"indexVersion": NumberInt("2"),
"direction": "backward",
"indexBounds": { },
"keysExamined": NumberInt("31193"),
"seeks": NumberInt("1"),
"dupsTested": NumberInt("31193"),
"dupsDropped": NumberInt("0")
},
{
"stage": "IXSCAN",
"nReturned": NumberInt("518"),
"executionTimeMillisEstimate": NumberInt("0"),
"works": NumberInt("519"),
"advanced": NumberInt("518"),
"needTime": NumberInt("0"),
"needYield": NumberInt("0"),
"saveState": NumberInt("170"),
"restoreState": NumberInt("170"),
"isEOF": NumberInt("1"),
"keyPattern": {
"_fts": "text",
"_ftsx": NumberInt("1"),
"publishtime": NumberInt("-1")
},
"indexName": "text_index",
"isMultiKey": true,
"isUnique": false,
"isSparse": false,
"isPartial": false,
"indexVersion": NumberInt("2"),
"direction": "backward",
"indexBounds": { },
"keysExamined": NumberInt("518"),
"seeks": NumberInt("1"),
"dupsTested": NumberInt("518"),
"dupsDropped": NumberInt("0")
},
{
"stage": "IXSCAN",
"nReturned": NumberInt("53254"),
"executionTimeMillisEstimate": NumberInt("21"),
"works": NumberInt("53255"),
"advanced": NumberInt("53254"),
"needTime": NumberInt("0"),
"needYield": NumberInt("0"),
"saveState": NumberInt("170"),
"restoreState": NumberInt("170"),
"isEOF": NumberInt("1"),
"keyPattern": {
"_fts": "text",
"_ftsx": NumberInt("1"),
"publishtime": NumberInt("-1")
},
"indexName": "text_index",
"isMultiKey": true,
"isUnique": false,
"isSparse": false,
"isPartial": false,
"indexVersion": NumberInt("2"),
"direction": "backward",
"indexBounds": { },
"keysExamined": NumberInt("53254"),
"seeks": NumberInt("1"),
"dupsTested": NumberInt("53254"),
"dupsDropped": NumberInt("0")
},
{
"stage": "IXSCAN",
"nReturned": NumberInt("17189"),
"executionTimeMillisEstimate": NumberInt("3"),
"works": NumberInt("17190"),
"advanced": NumberInt("17189"),
"needTime": NumberInt("0"),
"needYield": NumberInt("0"),
"saveState": NumberInt("170"),
"restoreState": NumberInt("170"),
"isEOF": NumberInt("1"),
"keyPattern": {
"_fts": "text",
"_ftsx": NumberInt("1"),
"publishtime": NumberInt("-1")
},
"indexName": "text_index",
"isMultiKey": true,
"isUnique": false,
"isSparse": false,
"isPartial": false,
"indexVersion": NumberInt("2"),
"direction": "backward",
"indexBounds": { },
"keysExamined": NumberInt("17189"),
"seeks": NumberInt("1"),
"dupsTested": NumberInt("17189"),
"dupsDropped": NumberInt("0")
}
]
}
}
}
}
}
}
},
"nReturned": NumberLong("10"),
"executionTimeMillisEstimate": NumberLong("456")
},
{
"$lookup": {
"from": "bidding_data",
"as": "bidding_data",
"localField": "_id",
"foreignField": "_id"
},
"nReturned": NumberLong("10"),
"executionTimeMillisEstimate": NumberLong("458")
}
],
"serverInfo": {
"host": "localhost.localdomain",
"port": NumberInt("27017"),
"version": "4.4.9",
"gitVersion": "b4048e19814bfebac717cf5a880076aa69aba481"
},
"ok": 1
}
As a result of the reduced document size, the totalDataSizeSorted in the sorting stage has decreased from 2.18 GB (2342696268) to 18.8 MB (19739136). Consequently, MongoDB only required 0.4 seconds to generate the result.
But I do have a few questions.
In my current test environment, I only used 0.4 seconds for 170,000 pieces of data. However, in my production environment, there are 5 million pieces of data, which is 29 times the difference in data volume. Given this 5 million data volume, can I anticipate a 29-fold increase in response time? Is this approach viable?
Have I misconstrued your suggestion? Are you proposing that I manually construct an inverted index using this new collection, or simply transfer the necessary search fields to the new collection to decrease the document size?
I also want to use Atlas, but I realized that all Atlas servers are located in Europe. Since our clients are in China, having the servers in Europe would result in very slow network latency for user searches.
If MongoDB doesn’t pan out, our last resort is to consider utilizing Elasticsearch.