Aggregate
- 使用Mongodb内置的原生操作,聚合效率非常高类似于SQL Group By 操作。
- 每个阶段管道限制为100MB的内存
SQL |
Mongodb |
where |
$match |
group by |
$group |
having |
$match |
selet |
$project |
order by |
$sort |
limit |
$limit |
sum() |
$sum |
count() |
$sum |
join |
$lookup |
db.collection.aggregate(pipeline<array>, options<document>)
- 查询记录条数
db.collection.aggregate({
$group : {
_id: null,
count: {$sum: 1}
}
})
// Sql
select count(*) as count from table;
- 查询某个字段之和
db.collection.aggregate({
$group : {
_id: null,
count: {$sum: "$age"}
}
})
// Sql
select sum(age) as count from table;
- 以某个字段为键,求和
db.collection.aggregate({
$group : {
_id: "$sex",
count: {$sum: "$age"}
}
})
// Sql
select sex, sum(age) as count from table group by sex;
- 多个字段为键,进行求和
db.collection.aggregate({
$group: {
_id: {
crawl_name: "$crawl_name",
get_date: "$get_date"
},
count: {
$sum: 1
}
}
})
// sql
select crawl_name, get_date, count(*) from table group by crawl_name, get_date;
- 对聚合的字段进行过滤
db.collection.aggregate([{
$match: {
get_date: {
$gte: 20200701
}
}
}, {
$group: {
_id: {
crawl_name: "$crawl_name",
get_date: "$get_date"
},
count: {
$sum: 1
}
}
}, {
$match: {
count: {
$gte: 10
}
}
}])
// sql
select crawl_name, get_date, count(*)
from table
where get_date >= 20200701
group by crawl_name, get_date
having count(*) > 10;
MapReduce
var map = function() {
emit(this.crawl_name, { count: 1 });
}
var reduce = function(key, emits) {
total = 0
for (var i in emits) {
total += emits[i].count;
}
return {
"count": total
};
}
mr = db.runCommand({
"mapreduce": "collection",
"map": map,
"reduce": reduce,
"out": {inline: 1}
})