MongoDB complex select count group by function - mysql

I have a collection called 'my_emails' where are stored email addresses :
[
{ email:"russel#gmail.com"},
{ email:"mickey#yahoo.com"},
{ email:"john#yahoo.com"},
]
and I try to get the top 10 hostnames used...
[
{host: "gmail.com", count: 1000},
{host: "yahoo.com", count: 989}, ...
]
if I had MySQL, I’ll do this query :
SELECT substr(email,locate('#',email)+1,255) AS host,count(1) AS count
FROM my_emails
WHERE email like '%#%'
GROUP BY substr(email,locate('#',email)+1,255)
ORDER BY count(1) DESC
LIMIT 10
how can I do with mongodb ?
I try without result something like this :
db.my_emails.aggregate([ { $group : {_id : "$host", count : { $sum : 1 }}}]);
I don't know how to make the $host value without adding a new property to my records

MongoDB doesn't provide any operator like locate but you can use .mapReduce to do this:
db.collection.mapReduce(
function() {
emit(this.email.substr(this.email.indexOf('#') + 1), 1);
},
function(host, count) {
return Array.sum(count) ; },
{ out: "hosts" }
)
Then db.hosts.find().sort({ 'value': -1 }).limit(10) returns top 10 hostname:
{ "_id" : "yahoo.com", "value" : 2 }
{ "_id" : "gmail.com", "value" : 1 }

An alternative workaround would be to modify your data structure by introducing another field in your schema which holds only the domain value of the email address. This can be done with a bulk update using the Bulk API operations that give a better write response i.e. useful information about what actually happened during the update:
var bulk = db.my_emails.initializeUnorderedBulkOp(),
count = 0;
db.my_emails.find().forEach(function(doc) {
var domain = doc.email.replace(/.*#/, ""),
update = { domain: domain };
bulk.find({ "_id": doc._id }).updateOne({
"$set": update
})
count++;
if (count % 1000 == 0) {
bulk.execute();
bulk = db.my_emails.initializeUnorderedBulkOp();
}
})
if (count % 1000 != 0) { bulk.execute(); }
Bulk update response from sample:
BulkWriteResult({
"writeErrors" : [ ],
"writeConcernErrors" : [ ],
"nInserted" : 0,
"nUpserted" : 0,
"nMatched" : 3,
"nModified" : 3,
"nRemoved" : 0,
"upserted" : [ ]
})
After this update, a query on the collection db.my_emails.find().pretty() will yield:
{
"_id" : ObjectId("561618af645a64b1a70af2c5"),
"email" : "russel#gmail.com",
"domain" : "gmail.com"
}
{
"_id" : ObjectId("561618af645a64b1a70af2c6"),
"email" : "mickey#yahoo.com",
"domain" : "yahoo.com"
}
{
"_id" : ObjectId("561618af645a64b1a70af2c7"),
"email" : "john#yahoo.com",
"domain" : "yahoo.com"
}
Now, having the domain field will make it easier for the aggregation framework to give you the host count through the $sum operator in the $group pipeline. The following pipeline operation will return the desired outcome:
db.my_emails.aggregate([
{
"$group": {
"_id": "$domain",
"count": { "$sum": 1 }
}
}
])
Output:
{ "_id" : "yahoo.com", "count" : 2 }
{ "_id" : "gmail.com", "count" : 1 }

Related

Unable to print the value in json objects containing multiple array values in python

{
"d" : {
"results" : [
{
"timeAccount" : "031799ce7bc344a2bb65e7f05cb08c49", "balance" : "10.1388824", "timeAccountType" : "SA_AL_DLY", "userId" : "01000", "accountClosed" : false, "timeUnit" : "DAYS"
}, {
"timeAccount" : "bc4fb4d44c3e413d8137a59d121c74b6", "balance" : "25.347206", "timeAccountType" : "SA_AL_DLY", "userId" : "01000", "accountClosed" : false, "timeUnit" : "DAYS"
}, {
"timeAccount" : "f12279ea0d34471581aff90ad71d3f83", "balance" : "5.6249964", "timeAccountType" : "SA_AL_DLY", "userId" : "01000", "accountClosed" : false, "timeUnit" : "DAYS"
}
]
}
}
this is my json value i got through api request. I need to print all balance present there
data=r.json()
for results in data['d']:
print results['results']['balance']
i tried to iterate with the above python code but unable to
please help me with this
thanks
this is my sample output form
balance 1 : 10.1388824
balance 2 : 25.347206
balance 3 : 5.6249964
Try this code:
i = 1
for res in data['d']["results"]:
print("balance ",i,":", res["balance"])
i += 1
Iterate this way..
for res in data['d']["results"]:
print (res["balance"])

Query for : How many elements of an array are matching within a string in mongoDb

Suppose my JSON is like following:
{ "id":0,"keywords":"amount,debited,account,ticket,not,generated,now" }
{ "id":1,"keywords":"how,safe,gocash" }
{ "id":2,"keywords":"how,referral,program,gocash,works" }
If my array is like
array =["how","safe","gocash"];
then how do I get the count that while checking with first; count should be zero, with second three and with third two. (That means how many elements of an array are present in the string)
Is it possible or what approach I should adopt?
One way of solving this would require some form of modification to your schema by adding an extra field that holds the keywords in an array. This field becomes quite handy when running an aggregation pipeline to return the desired count of elements of an array that match the original string.
To add the additional field you would need the Bulk API operations to update the collection as follows:
var bulk = db.collection.initializeOrderedBulkOp(),
count = 0;
db.collection.find({"keywords": { "$exists": true, "$type": 2 }}).forEach(function(doc) {
var keywordsArray = doc.keywords.split(',');
bulk.find({ "_id": doc._id }).updateOne({
"$set": { "keywordsArray": keywordsArray }
});
count++;
if (count % 100 == 0) {
bulk.execute();
bulk = db.collection.initializeUnorderedBulkOp();
}
});
if (count % 100 != 0) { bulk.execute(); }
The above creates an additional field "keywordsArray" that is a result of splitting the keywords string to an array.
After the operation your sample collection would have the documents:
/* 0 */
{
"_id" : ObjectId("561e24e9ba53a16c763eaab4"),
"id" : 0,
"keywords" : "amount,debited,account,ticket,not,generated,now",
"keywordsArray" : [
"amount",
"debited",
"account",
"ticket",
"not",
"generated",
"now"
]
}
/* 1 */
{
"_id" : ObjectId("561e24e9ba53a16c763eaab5"),
"id" : 1,
"keywords" : "how,safe,gocash",
"keywordsArray" : [
"how",
"safe",
"gocash"
]
}
/* 2 */
{
"_id" : ObjectId("561e24e9ba53a16c763eaab6"),
"id" : 2,
"keywords" : "how,referral,program,gocash,works",
"keywordsArray" : [
"how",
"referral",
"program",
"gocash",
"works"
]
}
On to the next stage, the aggregation framework pipeline, run the following pipeline operation which uses the $let, $size and $setIntersection operators to work out the the desired count result:
var array = ["how","safe","gocash"];
db.collection.aggregate([
{
"$project": {
"id": 1, "keywords": 1,
"count": {
"$let": {
"vars": {
"commonToBoth": { "$setIntersection": [ "$keywordsArray", array ] }
},
"in": { "$size": "$$commonToBoth" }
}
}
}
}
])
Sample Output:
/* 0 */
{
"result" : [
{
"_id" : ObjectId("561e24e9ba53a16c763eaab4"),
"id" : 0,
"keywords" : "amount,debited,account,ticket,not,generated,now",
"count" : 0
},
{
"_id" : ObjectId("561e24e9ba53a16c763eaab5"),
"id" : 1,
"keywords" : "how,safe,gocash",
"count" : 3
},
{
"_id" : ObjectId("561e24e9ba53a16c763eaab6"),
"id" : 2,
"keywords" : "how,referral,program,gocash,works",
"count" : 2
}
],
"ok" : 1
}

How to match field with another field of sub document in Mongo

mlb_players is a collection having dataId as a field and a teamData as another sub document containing teamId and dataId.
Equivalent MySql: SELECT p.playerId FROM mlb_players p INNER JOIN mlb_teams t ON p.dataId=t.dataId
I want to convert this mysql query into equivalent of mongoDB.
Currently I am using below mongo query but giving an error:
db.getCollection('mlb_players').find({$where: "this.dataId == this.teamData.dataId"});
But it is giving me 0 records however with the same data in mysql is giving my 50 records.
For reference, the 1 of the document looks like:
{
"_id" : ObjectId("55574d05e03ce60f64797432"),
"dataId" : "87c484cc-e672-4a34-ab02-cc29101d8a52",
"teamId" : "TB",
"posId" : "SS",
"playerName" : "Nick Franklin",
"playerNameAbbr" : "N. Franklin",
"playerJersey" : 2,
"playerStatus" : "D",
"playerSalary" : 5500,
"salarySuggest" : 0,
"playerProj" : 0.0,
"playerAvg" : 0.0,
"manualSalary" : 1,
"playerImg" : 0,
"hasImage" : 0,
"used" : 0,
"teamData" : {
"_id" : ObjectId("554239eb4e7235204100002a"),
"teamId" : "TB",
"dataId" : "87c484cc-e672-4a34-ab02-cc29101d8a52",
"teamName" : "Tampa Bay Rays",
"teamAbbr" : "TB"
}
}
Please suggest. Thanks in Advance
You can use aggregation and the $eq operator
db.mlb_players.aggregate(
[
{ $project: { "m": { $eq: [ "$dataId", "$teamData.dataId" ]}}},
{ $match: { "m": true }},
{ $project: { "_id": 1 }}
]
)
In first sentence you wrote that you have players collection, but your query looks for players in mlb_players collection. Maybe that is the purpose of not getting any documents...

Generating Mongo query from MySQL query

I have been using the following MySQL command to construct a heatmap from log data. However, I have a new data set that is stored in a Mongo database and I need to run the same command.
select concat(a.packages '&' b.packages) "Concurrent Packages",
count(*) "Count"
from data a
cross join data b
where a.packages<b.packages and a.jobID=b.jobID
group by a.packages, b.packages
order by a.packages, b.packages;
Keep in mind that the tables a and b do not exist prior to the query. However, they are created from the packages column of the data table, which has jobID as the field which I want to check for matches. In other words if two packages are within the same job I want to add an entry to the concurrent usage count. How can I generate a similar query in Mongo?
This is not a "join" of different documents; it is an operation within one document, and can be done in MongoDB.
You have a SQL TABLE "data" like this:
JobID TEXT,
package TEXT
The best way to store this in MongoDB will be a collection called "data", containing one document per JobID that contains an array of packages:
{
_id: <JobID>,
packages: [
"packageA",
"packageB",
....
]
}
[ Note: you could also implement your data table as only one document in MongoDB, containing an array of jobs which contain each an array of packages. This is not recommended, because you might hit the 16MB document size limit and nested arrays are not (yet) well supported by different queries - if you want to use the data for other purposes as well ]
Now, how to get a result like this ?
{ pair: [ "packageA", "packageB" ], count: 20 },
{ pair: [ "packageA", "packageC" ], count: 11 },
...
As there is no built-in "cross join" of two arrays in MongoDB, you'll have to program it out in the map function of a mapReduce(), emitting each pair of packages as a key:
mapf = function () {
that = this;
this.packages.forEach( function( p1 ) {
that.packages.forEach( function( p2 ) {
if ( p1 < p2 ) {
key = { "pair": [ p1, p2 ] };
emit( key, 1 );
};
});
});
};
[ Note: this could be optimized, if the packages arrays were sorted ]
The reduce function is nothing more than summing up the counters for each key:
reducef = function( key, values ) {
count = 0;
values.forEach( function( value ) { count += value } );
return count;
};
So, for this example collection:
> db.data.find()
{ "_id" : "Job01", "packages" : [ "pA", "pB", "pC" ] }
{ "_id" : "Job02", "packages" : [ "pA", "pC" ] }
{ "_id" : "Job03", "packages" : [ "pA", "pB", "pD", "pE" ] }
we get the following result:
> db.data.mapReduce(
... mapf,
... reducef,
... { out: 'pairs' }
... );
{
"result" : "pairs",
"timeMillis" : 443,
"counts" : {
"input" : 3,
"emit" : 10,
"reduce" : 2,
"output" : 8
},
"ok" : 1,
}
> db.pairs.find()
{ "_id" : { "pair" : [ "pA", "pB" ] }, "value" : 2 }
{ "_id" : { "pair" : [ "pA", "pC" ] }, "value" : 2 }
{ "_id" : { "pair" : [ "pA", "pD" ] }, "value" : 1 }
{ "_id" : { "pair" : [ "pA", "pE" ] }, "value" : 1 }
{ "_id" : { "pair" : [ "pB", "pC" ] }, "value" : 1 }
{ "_id" : { "pair" : [ "pB", "pD" ] }, "value" : 1 }
{ "_id" : { "pair" : [ "pB", "pE" ] }, "value" : 1 }
{ "_id" : { "pair" : [ "pD", "pE" ] }, "value" : 1 }
For more information on mapReduce consult: http://docs.mongodb.org/manual/reference/method/db.collection.mapReduce/ and http://docs.mongodb.org/manual/applications/map-reduce/
You can't. Mongo doesn't do joins. Switching from SQL to Mongo is a lot more involved than migrating your queries.
Typically, you would include all the pertinent information in the same record (rather than normalize the information and select it with a join). Denormalize!

Expressing this SQL as Mongo Query

I want to figure out the most active users on my site.
I have records of the form
{
"_id" : "db1855b0-f2f4-44eb-9dbb-81e27780c796",
"createdAt" : 1360497266621,
"profile" : { "name" : "test" },
"services" : { "resume":
{ "loginTokens" : [{
"token" : "82c01cb8-796a-4765-9366-d07c98c64f4d",
"when" : 1360497266624
},
{
"token" : "0e4bc0a4-e139-4804-8527-c416fb20f6b1",
"when" : 1360497474037
} ]
},
"twitter" : {
"accessToken" : "9314Sj9kKvSyosxTWPY5r57851C2ScZBCe",
"accessTokenSecret" : "UiDcJfOfjH7g9UiBEOBs",
"id" : 2933049,
"screenName" : "testname"
}
}
}
I want to be able to select users and order by the number of loginTokens.
In MySQL it would be something like:
SELECT id, COUNT(logins) AS logins
FROM users
GROUP BY id ORDER BY logins DESC
I've tried this on querymongo.com and i got an error (can't work with aliases/ cant order by non-column names)
What's the Mongo way to do this?
Thanks!
I just converted:
SELECT id, COUNT(logins)
FROM users
GROUP BY id
To:
db.users.group({
"key": {
"id": true
},
"initial": {
"countlogins": 0
},
"reduce": function(obj, prev) {
prev.countlogins++;
}
});
Hope this helps
Here is an example of what you said using the aggregation framework:
db.users.aggregate([
{$unwind: '$services.resume.loginTokens'},
{$group: {_id: '$_id', logins: {$sum: 1}}},
{$sort: {logins: -1}}
])
This should do the trick.