Smandoli Smandoli - 1 month ago 7
SQL Question

SQL for islands-and-gaps: islands can overlap

I have robots with certificates. There are two kinds of certificates. For each kind of certificate (identified by

Certif_ID
), for each robot I need the most recent certified date-span.

Update for clarity: Date-spans that do not overlap but are contiguous are treated as a single span. See the first two records in the sample table that is at the top of the code.

Date-spans may overlap! These must be treated as a single span. This is where I'm having a problem.

in SQL Server 2012, run this code as-is to see what's happening.

BEGIN -- #certif_span
IF OBJECT_ID('TEMPDB..#certif_span') IS NOT NULL DROP TABLE #certif_span;

CREATE TABLE #certif_span
( Robot_ID CHAR(3)
, Certif_ID SMALLINT
, d_Start SMALLDATETIME
, d_End SMALLDATETIME );

INSERT INTO #certif_span VALUES ('210', '1', '2000-01-01', '2001-02-02');
INSERT INTO #certif_span VALUES ('210', '1', '2001-02-03', '2001-12-31');
INSERT INTO #certif_span VALUES ('210', '1', '2000-01-01', '2000-12-31');
INSERT INTO #certif_span VALUES ('880', '1', '2001-01-01', '2001-12-31');
INSERT INTO #certif_span VALUES ('880', '1', '2002-02-02', '2003-02-01');
INSERT INTO #certif_span VALUES ('880', '1', '2003-01-01', '2004-12-31'); -- *
INSERT INTO #certif_span VALUES ('880', '7', '2010-05-05', '2011-05-04');
INSERT INTO #certif_span VALUES ('880', '7', '2011-05-05', '2012-02-10');
INSERT INTO #certif_span VALUES ('880', '7', '2013-03-03', '2013-04-04');
INSERT INTO #certif_span VALUES ('880', '7', '2013-04-01', '2013-05-05'); -- *
-- * This line has dates that overlap with the line above
END

SELECT Robot_ID
, Certif_ID
, d_Start = FORMAT(d_Start, 'yyyy-MM-dd')
, d_End = FORMAT(d_End, 'yyyy-MM-dd')
, commentary = 'Here is the raw data'
FROM #certif_span AS cs
ORDER BY Robot_ID
, Certif_ID
, d_End

IF OBJECT_ID('TEMPDB..#prac_date_span') IS NOT NULL DROP TABLE #prac_date_span;

SELECT DISTINCT
cs.Robot_ID
, cs.Certif_ID
, cs.d_Start
, cs.d_End
INTO
--DROP TABLE --SELECT * FROM
#prac_date_span
FROM
#certif_span AS cs
GROUP BY
cs.Robot_ID
, cs.Certif_ID
, cs.d_Start
, cs.d_End
ORDER BY 1, 2, 3;

BEGIN

IF OBJECT_ID('TEMPDB..#prac_date_span_grp') IS NOT NULL
DROP TABLE #prac_date_span_grp;

WITH cte as (
SELECT
a.Robot_ID, a.Certif_ID
, a.d_Start, a.d_End
FROM
#prac_date_span a
LEFT JOIN #prac_date_span b
ON a.Robot_ID = b.Robot_ID
AND b.Certif_ID = a.Certif_ID
AND a.d_Start - 1 = b.d_End
WHERE
b.Robot_ID IS NULL
UNION ALL -----------------------------
SELECT
a.Robot_ID, a.Certif_ID
, a.d_Start, b.d_End
FROM
cte a
JOIN
#prac_date_span b
ON a.Robot_ID = b.Robot_ID
AND b.Certif_ID = a.Certif_ID
AND b.d_Start - 1 = a.d_End
)
SELECT
Robot_ID
, Certif_ID
, d_Start
, d_End = MAX(d_End)
INTO
--drop table --select * from
#prac_date_span_grp
FROM cte
GROUP BY Robot_ID, Certif_ID, d_Start
ORDER BY Robot_ID, Certif_ID;
END

SELECT
Robot_ID
, Certif_ID
, d_Start = FORMAT(d_Start, 'yyyy-MM-dd')
, d_End = FORMAT(d_End, 'yyyy-MM-dd')
, commentary = 'Here is the grouped data (flawed)'
FROM #prac_date_span_grp

SELECT
Robot_ID
, Certif_ID
, d_Start = FORMAT(MAX(d_Start), 'yyyy-MM-dd')
, d_End = FORMAT(MAX(d_End), 'yyyy-MM-dd')
, commentary = 'Final result: Start date ' +
CASE FORMAT(MAX(d_Start), 'yyyy-MM-dd')
WHEN '2003-01-01' THEN 'should be 2002-02-02'
WHEN '2013-04-01' THEN 'should be 2013-03-03'
ELSE 'good' END
FROM #prac_date_span_grp
GROUP BY Robot_ID, Certif_ID


The final result should be:

Robot_ID Certif_ID d_Start d_End
210 1 2000-01-01 2001-12-31
880 1 2002-02-02 2004-12-31
880 7 2013-03-03 2013-05-05


I've been fiddling with the date comparisons. In this bit from the
cte
, the
-1
looks like it allows for a one-day stagger in date-spans:

AND b.Certif_ID = a.Certif_ID
AND a.d_Start - 1 = b.d_End
...
AND b.Certif_ID = a.Certif_ID
AND b.d_Start - 1 = a.d_End


I feel certain this is the point that needs fixing. I've tried changing the date compare to
>=
. (This requires me to deal with max recursion.) The grouping changes, but is not correct.

Answer

This is not a simple task. I hope this will answer the problem.

Declare @certif_span TABLE(Robot_ID CHAR(3), Certif_ID SMALLINT, StartDate date, EndDate date);

  INSERT INTO @certif_span VALUES ('210', '1', '2000-01-01', '2001-02-02');
  INSERT INTO @certif_span VALUES ('210', '1', '2001-02-03', '2001-12-31');
  INSERT INTO @certif_span VALUES ('210', '1', '2000-01-01', '2000-12-31');
  INSERT INTO @certif_span VALUES ('880', '1', '2001-01-01', '2001-12-31');
  INSERT INTO @certif_span VALUES ('880', '1', '2002-02-02', '2003-02-01');
  INSERT INTO @certif_span VALUES ('880', '1', '2003-01-01', '2004-12-31'); -- *
  INSERT INTO @certif_span VALUES ('880', '7', '2010-05-05', '2011-05-04');
  INSERT INTO @certif_span VALUES ('880', '7', '2011-05-05', '2012-02-10');
  INSERT INTO @certif_span VALUES ('880', '7', '2013-03-03', '2013-04-04');
  INSERT INTO @certif_span VALUES ('880', '7', '2013-04-01', '2013-05-05'); -- *

;with Src as(
SELECT  ROW_NUMBER() Over(Partition by Robot_ID, Certif_ID order by StartDate, EndDate) as RN
                ,a.*
        FROM @certif_span as a
)

  , Islands as(
        SELECT RN, Robot_ID, Certif_ID, StartDate, EndDate, 0 as islandNo, EndDate AS MovingEnd
        FROM Src as a WHERE a.RN=1
        UNION ALL
        SELECT a.RN, a.Robot_ID, a.Certif_ID, a.StartDate, a.EndDate
             , b.islandNo + CASE WHEN DATEDIFF(d, a.StartDate, b.MovingEnd)>=-1 THEN 0 ELSE 1 END as IslandNO
             , CASE WHEN a.EndDate>b.MovingEnd THEN a.EndDate ELSE b.MovingEnd END as MovingEnd
        FROM Src as a 
        INNER JOIN Islands as b on a.Robot_ID=b.Robot_ID and a.Certif_ID=b.Certif_ID and a.RN=b.RN+1
    )   --  SELECT * FROM Islands order by Robot_ID, Certif_ID, IslandNo

  , LastIsland as(
        SELECT Robot_ID, Certif_ID, islandNo, MIN(StartDate) as startDate, MAX(EndDate) as EndDate
              ,ROW_NUMBER() over(partition by Robot_ID, Certif_ID order by IslandNO desc) as RN
        FROM Islands
        Group by Robot_ID, Certif_ID, islandNo
)
    SELECT Robot_ID, Certif_ID, startDate, EndDate 
    FROM   LastIsland
    where  RN=1