Andrey Andrey - 1 year ago 49
SQL Question

Best way to select records that having more recent date from grouped duplicates

I have table like this:

CREATE TABLE [dbo].[TestToDelete](
[id] [int] NULL,
[Email] [nvarchar](50) NULL,
[RawEmail] [nvarchar](50) NULL,
[Status] [tinyint] NULL,
[ValidationDate] [datetime] NULL
) ON [PRIMARY]

INSERT [dbo].[TestToDelete] ([id], [Email], [RawEmail], [Status], [ValidationDate]) VALUES (1, N'[email protected]', N'[email protected]', 11, CAST(N'2017-02-07 14:00:30.300' AS DateTime))
INSERT [dbo].[TestToDelete] ([id], [Email], [RawEmail], [Status], [ValidationDate]) VALUES (2, N'[email protected]', N'[email protected]', 11, CAST(N'2017-02-07 14:00:52.347' AS DateTime))
INSERT [dbo].[TestToDelete] ([id], [Email], [RawEmail], [Status], [ValidationDate]) VALUES (3, N'[email protected]', N'[email protected]', 11, CAST(N'2017-02-07 14:00:58.117' AS DateTime))
INSERT [dbo].[TestToDelete] ([id], [Email], [RawEmail], [Status], [ValidationDate]) VALUES (4, N'[email protected]', N'[email protected]', 22, CAST(N'2017-02-07 14:01:08.360' AS DateTime))
INSERT [dbo].[TestToDelete] ([id], [Email], [RawEmail], [Status], [ValidationDate]) VALUES (5, N'[email protected]', N'[email protected]', 11, CAST(N'2017-02-07 14:01:21.783' AS DateTime))
INSERT [dbo].[TestToDelete] ([id], [Email], [RawEmail], [Status], [ValidationDate]) VALUES (6, N'[email protected]', N'[email protected]', 11, CAST(N'2017-02-07 14:01:29.310' AS DateTime))
INSERT [dbo].[TestToDelete] ([id], [Email], [RawEmail], [Status], [ValidationDate]) VALUES (7, N'[email protected]', N'[email protected]', 22, CAST(N'2017-02-07 14:01:37.050' AS DateTime))
INSERT [dbo].[TestToDelete] ([id], [Email], [RawEmail], [Status], [ValidationDate]) VALUES (8, NULL, N'[email protected]', 0, CAST(N'2017-02-07 14:02:10.643' AS DateTime))
INSERT [dbo].[TestToDelete] ([id], [Email], [RawEmail], [Status], [ValidationDate]) VALUES (9, NULL, N'[email protected]', 0, CAST(N'2017-02-07 14:02:22.160' AS DateTime))
INSERT [dbo].[TestToDelete] ([id], [Email], [RawEmail], [Status], [ValidationDate]) VALUES (10, N'[email protected]', N'[email protected]', 11, CAST(N'2017-02-07 15:30:01.637' AS DateTime))
INSERT [dbo].[TestToDelete] ([id], [Email], [RawEmail], [Status], [ValidationDate]) VALUES (11, N'[email protected]', N'[email protected]', 11, CAST(N'2017-02-07 15:30:06.657' AS DateTime))
INSERT [dbo].[TestToDelete] ([id], [Email], [RawEmail], [Status], [ValidationDate]) VALUES (12, N'[email protected]', N'[email protected]', 11, CAST(N'2017-02-07 15:30:12.160' AS DateTime))


And I need to select records (
Email
,
RawEmail
and
Status
fields) that occurs 3 or more times and have more recent date. In this table it is



and not



because has more recent date

The query to do this selection:

select * from
(
select email, rawEmail, Status,
ROW_NUMBER() OVER(PARTITION BY rawEmail ORDER BY vdate DESC) num
from
(select max([ValidationDate]) vdate, email, rawEmail, Status
from TestToDelete where status in (11, 22)
group by rawEmail, email, status
having count(*) > 2
) tmp
)final where num = 1


Is it possible to do this with less subqueries (not 3 like now)?




UPDATE:
Expected output for 3 or more occurs:



Expected output for 2 or more occurs:


Answer Source

Using with (common table expression), row_number(), and count() over()

If we can partition the count() by RawEmail, Status then:

;with cte as (
    select
        rn = row_number() over (
            partition by RawEmail
            order by ValidationDate desc
            )
        , cnt = count(*) over (
            partition by RawEmail, status
            )
        , *
    from TestToDelete
    where status in (11, 22)
    )
select * 
from cte o 
where o.rn=1 
  and o.cnt > 2

results: http://rextester.com/WYVZ86149

+----+-----+----+-----------+----------+--------+---------------------+
| rn | cnt | id |   Email   | RawEmail | Status |   ValidationDate    |
+----+-----+----+-----------+----------+--------+---------------------+
|  1 |   7 | 12 | [email protected] | [email protected] |     11 | 07.02.2017 15:30:12 |
+----+-----+----+-----------+----------+--------+---------------------+

If we can not partition the count() by RawEmail, Status then:

;with cte as (
    select
        rn = row_number() over (
            partition by RawEmail
            order by ValidationDate desc
            )
        , cnt = count(*) over (
            partition by RawEmail
            )
        , *
    from TestToDelete
    where status in (11, 22)
    )
select * 
from cte o 
where o.rn=1 
  and o.cnt > 2
  and exists (
    select 1
      from cte i 
      where i.RawEmail = o.RawEmail
        and i.Email != o.Email
      )

results: http://rextester.com/YTQ30810

+----+-----+----+-----------+----------+--------+---------------------+
| rn | cnt | id |   Email   | RawEmail | Status |   ValidationDate    |
+----+-----+----+-----------+----------+--------+---------------------+
|  1 |   7 | 12 | [email protected] | [email protected] |     11 | 07.02.2017 15:30:12 |
+----+-----+----+-----------+----------+--------+---------------------+
Recommended from our users: Dynamic Network Monitoring from WhatsUp Gold from IPSwitch. Free Download