nik nik - 18 days ago 8
C# Question

Return first x items in a group of groups

I'm querying a datatable and I seem stuck on selecting a group of groups.

This code

var grouping = table.AsEnumerable()
.Where(x => curveids.Contains(x.Field<short>("CurveID")) && x.Field<DateTime>("Timestamp").Hour >= hour && x.Field<DateTime>("Timestamp").Hour < (hour + 1))
.GroupBy(x => x.Field<DateTime>("Timestamp")).Where(x => x.Select(y => y["CurveID"]).Count() == curveids.Count);


Groups by timestamp and returns a group of x curves, where x = curveid.Count(). It contains 5000ish groups.

However for each day there can be more than one timestamp.

int nrdays = grouping.GroupBy(z => z.Key.Date).Count();


tells me there are 255 distinct days.

I would now like to group this again, but not by time stamp but by calendar day and then take the first (as in earliest) group for each day. I tried this:

var grouping2 = grouping.GroupBy(z => z.Key.Date).OrderBy(a => a.Key).Take(curveids.Count);


but this only returns 4 groups and I dont get why?
It should return 255 groups with each of them containing the same timestamp and x curveids, so x*255 record sets.

The datatable has 3 columns, Timestamp (DateTime), CurveID(short), Price(double).

UPDATE

As requested by Mr Skeet a full example:

public class listprx
{
public DateTime timestamp;
public int curveID;
public double prx;
}

static void Main(string[] args)
{
var data = new List<listprx>();

// populating data
for (int i = 0; i < 50000; i++)
{
Random rand = new Random(i);
var tempdt = new DateTime(2016, rand.Next(1, 12), rand.Next(1, 29), rand.Next(1, 23), rand.Next(1, 59), 0);

if(i % 3 == 0)
{
data.Add(new listprx { timestamp = tempdt, curveID = 1, prx = rand.Next(1,50)});
data.Add(new listprx { timestamp = tempdt, curveID = 2, prx = rand.Next(1, 50) });
}
else if (i % 5 == 0)
{
data.Add(new listprx { timestamp = tempdt, curveID = 1, prx = rand.Next(1, 50) });
}
else
{
data.Add(new listprx { timestamp = tempdt, curveID = 1, prx = rand.Next(1, 50) });
data.Add(new listprx { timestamp = tempdt, curveID = 2, prx = rand.Next(1, 50) });
data.Add(new listprx { timestamp = tempdt, curveID = 3, prx = rand.Next(1, 50) });
}
}

// setting hour criteria
int hour = 16;
int nrcurves = 3;

// grouping by timestamp and only take those where all curves are there, (as close to the desired time as possible
var grouping = data.Where(x => x.timestamp.Hour >= hour && x.timestamp.Hour < (hour + 1))
.GroupBy(x => x.timestamp).Where(x => x.Select(y => y.curveID).Count() == nrcurves);

// Grouping by day and take only the time stamp that is closest to the hour
// this fails
var grouping2 = grouping.GroupBy(z => z.Key.Date).OrderBy(a => a.Key).Take(nrcurves);

Console.WriteLine("Nr of timestamps with all curves {0}, nr of days {1}, nr of groups in second group {2}, expected same as nr days"
, grouping.Count(), grouping.GroupBy(z => z.Key.Date).Count(), grouping2.Count());

Console.ReadLine();
}


UPDATE 2

I have removed the random element and simplified further:

public class listprx
{
public DateTime timestamp;
public int curveID;
}

static void Main(string[] args)
{
var data = new List<listprx>();

// populating data
var tempdt = new DateTime(2016, 4, 6, 16, 1, 0);

for (int i = 0; i < 4; i++)
{
if (i == 2)
{
tempdt = tempdt.AddDays(1);
}

if(i % 2 == 0 )
{
data.Add(new listprx { timestamp = tempdt, curveID = 1});
}
else
{
data.Add(new listprx { timestamp = tempdt, curveID = 1});
data.Add(new listprx { timestamp = tempdt, curveID = 2});
}

tempdt = tempdt.AddMinutes(i+1);
}

// setting hour criteria
int hour = 16;
int nrcurves = 2;

//grouping by timestamp and only take those where all curves are there, (as close to the desired time as possible
var grouping = data.Where(x => x.timestamp.Hour >= hour && x.timestamp.Hour < (hour + 1))
.GroupBy(x => x.timestamp).Where(x => x.Select(y => y.curveID).Count() == nrcurves);

//Grouping by day and take only the time stamp that is closest to the hour
//this fails
var grouping2 = grouping.GroupBy(z => z.Key.Date).OrderBy(a => a.Key).Take(nrcurves);

Console.WriteLine("Nr of timestamps with all curves {0}, nr of days {1}, nr of groups in second group {2}, expected same as nr days"
, grouping.Count(), grouping.GroupBy(z => z.Key.Date).Count(), grouping2.Count());

Console.ReadLine();
}


The expected end result is:

Timestamp CurveID
------------------------
6/4/16 16:02 1
6/4/16 16:02 2
7/4/16 16:06 1
7/4/16 16:06 2

Answer

Edited answer working on your example.

Ok, I went trought your example and fixed some bugs and my answer. Let's clear code a bit and comment what went wrong where.

Our models will be

public class Curve
{
    public int CurveID { get; set; }
    public DateTime Timestamp { get; set; }
}

public class CurveGroup
{
    public DateTime Timestamp { get; set; }
    public IEnumerable<Curve> Curves { get; set; }
}

next is function to generate test data:

public static List<Curve> GetData()
{
    var data = new List<Curve>();
    var startTime = new DateTime(2016, 4, 6, 16, 1, 0);

    for (int i = 0; i < 4; i++)
    {
        if (i == 2)
        {
           //startTime.AddDays(1); - this line does nothing, DateTime is an immutable struct so all function changing its value returns a new copy
           startTime = startTime.AddDays(1);
        }

        if (i % 2 == 0)
        {
           data.Add(CreateNewCurve(startTime, 1));
        }
        else
        {
           data.Add(CreateNewCurve(startTime, 1));
           data.Add(CreateNewCurve(startTime, 2));
        }

        //startTime.AddMinutes(i + 1); same issue as above
        startTime = startTime.AddMinutes(i + 1);
    }

    return data;
}

public static Curve CreateNewCurve(DateTime time, int curveID)
{
    return new Curve()
    {
        Timestamp = time,
        CurveID = curveID
    };
}

and here goes main function

static void Main(string[] args)
{
    var data = GetData();

    int hour = 16;
    int totalCurveCount = 2;

    var grouping = data
           .Where(x => x.Timestamp.Hour >= hour && x.Timestamp.Hour < (hour + 1))
           .GroupBy(x => x.Timestamp)
           .Where(x => x.Count() == totalCurveCount); //there is no need to select curveId like in your code: Where(x => x.Select(y => y.curveID).Count() == nrcurves);

    var grouping2 = grouping
           .GroupBy(x => x.Key.Date)
           .Select(x =>
                new CurveGroup
                {
                   Timestamp = x.Key,
                   Curves = x.OrderBy(c => c.Key).Take(totalCurveCount).SelectMany(c => c)
                }
           );


    foreach (var g in grouping2)
    {
        foreach (var c in g.Curves)
        {
            Console.WriteLine(c.Timestamp);
            Console.WriteLine(c.CurveID);
        }
    }
}

this returns expected results.

Your code failed because your second grouping is not taking (Take(nrcurves)) values in groups but groups themselves. So instead of returning 255 groups with 2 values in each you return 2 groups with all values in them.

Hope this fixes your issue.