如何计算所有工作的不重复受雇天数总数（重叠的天数不重复计算）- 答案

【问题标题】：How to Calculate the Total Unique Days Employed for All Jobs - No overlap days counted twice（如何计算所有工作的不重复受雇天数总数——重叠的天数不重复计算）
【发布时间】：2022-06-18 05:19:57
【问题描述】：

/* Data Setup
 | Sample job history, one row per job, keyed by GroupID.
 | A NULL EndDate means the job has not ended yet.
*/
DROP TABLE IF EXISTS #DaysPerJob;

CREATE TABLE #DaysPerJob
(
    GroupID   INT,
    JobDesc   VARCHAR(100),
    StartDate DATE,
    EndDate   DATE
);

INSERT INTO #DaysPerJob (GroupID, JobDesc, StartDate, EndDate)
VALUES
    (23293, 'Food Prep', '2017-03-01', '2017-07-17'),
    (23293, 'Finisher', '2021-11-19', NULL),
    (23293, 'Cashier', '2021-12-06', '2021-12-10'),
    (26208, '3rd SHift Stocker', '2019-09-25', '2020-11-05'),
    (26208, 'Order Fulfillment Assoc', '2020-08-05', '2021-04-16'),
    (26208, 'Customer Service Rep', '2021-05-10', '2021-10-15'),
    (26208, 'Delivery Driver', '2021-11-15', NULL),
    (26208, 'Another Job', '2022-02-23', '2022-03-02'),
    (26208, 'Same Day Job Start as Prev Job End', '2022-03-01', NULL);

--SELECT * FROM #DaysPerJob dpj ORDER BY dpj.GroupID, dpj.StartDate, dpj.EndDate

/* Days Per Job Calculations - Attempts
 | Lists every job together with the EndDate of the job that precedes it inside the same
 | GroupID (jobs ordered by StartDate, then EndDate) and the plain day span of the job itself.
 | A job without an EndDate is measured up to the current date.
 | Overlapping jobs are not de-duplicated here.
*/
SELECT
    job.GroupID,
    job.JobDesc,
    job.StartDate,
    job.EndDate,
    /* GroupID is constant inside its own partition, so it is not repeated in the window ORDER BY. */
    LAG(job.EndDate) OVER (
        PARTITION BY job.GroupID
        ORDER BY job.StartDate, job.EndDate
    ) AS PreviousJobEndDate,
    DATEDIFF(DAY, job.StartDate, ISNULL(job.EndDate, GETDATE())) AS daysPerJob
FROM #DaysPerJob AS job
ORDER BY
    job.GroupID,
    job.StartDate,
    job.EndDate

我如何获得每组独特天数的总和？

上面的 SQL 将为您提供一个工作记录表。每个职位都有一个开始日期，但并非所有职位都有一个结束日期，这意味着他们仍然受雇于该职位。

我一直在努力解决的问题是如何计算不重复的受雇天数。使用 DATEDIFF 函数计算每份工作各自的天数非常容易，但我目前不知道如何处理日期范围相互重叠的其他工作，因为简单相加会把重叠的那些天计算两次。

我按开始日期排序，然后使用 LAG 将上一份工作的结束日期与当前工作的开始日期进行比较。如果当前工作的开始日期 <= 上一份工作的结束日期（即两份工作的日期范围重叠），就改用上一份工作的结束日期作为当前工作的起算日期，只计算从该日期到当前工作结束日期之间的天数。

但是上述做法存在问题……如果上一份工作没有结束日期，或者上一份工作的结束日期晚于当前工作的结束日期，该怎么办？这意味着当前工作完全落在上一份工作的日期范围之内，因此不应为它计入任何天数，它的天数应为 0，这样在汇总总天数时就不会重复计算这份工作的天数。正是最后这个我不知道如何解决的问题，促使我在 Stack Overflow 上发帖提问。

/* Some SQL below of some things I have tried */
/* Days Per Job Calculations - Attempts
 | For every job, look at the EndDate of the previous job in the same GroupID (jobs ordered by
 | StartDate, then EndDate).  When that previous EndDate lies inside the current job's range
 | (StartDate <= previous EndDate <= EndDate) the row is flagged 'InRange' and its days are
 | counted from the previous EndDate; otherwise it is flagged 'NewSet' and its days are counted
 | from its own StartDate.
 | A comparison against a NULL date is never true, so a row whose own EndDate is NULL, or whose
 | previous job has no EndDate, always lands in the 'NewSet' branch.
*/
SELECT
    prev.GroupID,
    prev.JobDesc,
    prev.StartDate,
    prev.EndDate,
    prev.PreviousJobEndDate,

    /* Start date actually used for this row's day count.  The previous EndDate is known to be
     | non-NULL whenever the row is in range, so no NULL fallback is needed in that branch.
    */
    CASE WHEN chk.IsInRange = 1
        THEN prev.PreviousJobEndDate
        ELSE prev.StartDate
    END AS StartDateForSet,

    /* Same test as above, reported as a label. */
    CASE WHEN chk.IsInRange = 1
        THEN 'InRange'
        ELSE 'NewSet'
    END AS withinRangeCheck,

    /* Plain span of the job, ignoring every other job. */
    DATEDIFF(DAY, prev.StartDate, prev.EffectiveEndDate) AS daysPerJob,

    /* Field intended to be SUMmed per GroupID later on: span of the job measured from the
     | adjusted start date, so days shared with the previous job are meant to be left out.
    */
    DATEDIFF(
        DAY,
        CASE WHEN chk.IsInRange = 1
            THEN prev.PreviousJobEndDate
            ELSE prev.StartDate
        END,
        prev.EffectiveEndDate
    ) AS DaysEmployedWithinSet
FROM
(
    /* Compute the previous job's EndDate once per row; open-ended jobs end "now". */
    SELECT
        dj.GroupID,
        dj.JobDesc,
        dj.StartDate,
        dj.EndDate,
        LAG(dj.EndDate) OVER (PARTITION BY dj.GroupID ORDER BY dj.StartDate, dj.EndDate) AS PreviousJobEndDate,
        ISNULL(dj.EndDate, GETDATE()) AS EffectiveEndDate
    FROM #DaysPerJob dj
) AS prev
CROSS APPLY
(
    /* 1 when the previous job's EndDate falls within the current job's own date range. */
    VALUES (
        CASE WHEN prev.PreviousJobEndDate >= prev.StartDate
              AND prev.PreviousJobEndDate <= prev.EndDate
            THEN 1
            ELSE 0
        END
    )
) AS chk (IsInRange)
ORDER BY
    prev.GroupID,
    prev.StartDate,
    prev.EndDate

|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-| -|-|-|-|-|-|-|-|-|-|-|-|

此问题的解决方案如下，基于选择的正确发布答案

|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-| -|-|-|-|-|-|-|-|-|-|-|-|

我原以为这个问题会有更多答案，但它并不容易……至少对我来说不容易，我的同事也无法解答。无论如何，这个问题收到了两个答案。其中一个答案虽然已经很接近，但没有准确计算出受雇天数。我在数据和 Excel 中对计算进行了三次核对，基于本示例提供的数据集，总计应该如下所示。下面是 SQL Server 版本，使用递归 CTE 创建日期表。

/* SUM Unique Days in Multiple Date Range Records (SQL Server).sql
 | SQL Server Example
 | Desc: Shows how to obtain the unique days employed, i.e. a day is never counted twice
 |       even when an individual is employed at more than one job on that day.
*/

/* Data Setup
 | Sample job history, one row per job, keyed by GroupID.
 | A NULL EndDate means the job has not ended yet.
*/
DROP TABLE IF EXISTS #DaysPerJob;

CREATE TABLE #DaysPerJob
(
    GroupID   INT,
    JobDesc   VARCHAR(100),
    StartDate DATE,
    EndDate   DATE
);

INSERT INTO #DaysPerJob (GroupID, JobDesc, StartDate, EndDate)
VALUES
    (23293, 'Food Prep', '2017-03-01', '2017-07-17'),
    (23293, 'Finisher', '2021-11-19', NULL),
    (23293, 'Starter', '2021-11-21', '2021-12-13'),
    (23293, 'Cashier', '2021-12-06', '2021-12-10'),
    (26208, '3rd SHift Stocker', '2019-09-25', '2020-11-05'),
    (26208, 'Order Fulfillment Assoc', '2020-08-05', '2021-04-16'),
    (26208, 'Customer Service Rep', '2021-05-10', '2021-10-15'),
    (26208, 'Delivery Driver', '2021-11-15', NULL),
    (26208, 'Another Job', '2022-02-23', '2022-03-02'),
    (26208, 'Same Day Job Start as Prev Job End', '2022-03-01', NULL)
;

/* Unique days employed per GroupID.
 | Approach: a recursive CTE produces one row per calendar date from the earliest StartDate
 | through today; each date is joined to the jobs active on it, and only one row per
 | (GroupID, date) is kept so a day covered by several overlapping jobs is counted once.
 | Output: GroupID, daysEmployed, minStartDate, maxEndDate, TotalDaysInRange, unEmployedDays.
*/
WITH Dates(date) AS
(
    /* Anchor row: the earliest StartDate of any job. */
    SELECT MIN(StartDate) AS date
    FROM #DaysPerJob

    UNION ALL

    /* Recursive step: add one day until today is reached.  GETDATE() is cast to DATE so the
     | comparison is date-to-date; comparing a DATE with the raw datetime lets today's row pass
     | the filter (midnight < now) and generates an extra row for tomorrow.
    */
    SELECT DATEADD(DAY, 1, date)
    FROM Dates
    WHERE date < CAST(GETDATE() AS DATE)
)
, ranked AS
(   /* Rank the jobs active on each date within a GroupID so that overlapping jobs can later be
     | collapsed to a single row per day (ranker = 1).
     | INNER JOIN: calendar dates with no active job are simply not returned.
    */
    SELECT j.GroupID, j.JobDesc, j.StartDate, j.EndDate, d.date
        , ROW_NUMBER() OVER (PARTITION BY j.GroupID, d.date
                             ORDER BY j.StartDate, ISNULL(j.EndDate, CAST(GETDATE() AS DATE))) AS ranker
    FROM Dates d
        INNER JOIN #DaysPerJob j ON j.StartDate <= d.date
                                 /* A job without an EndDate is treated as running through today */
                                 AND ISNULL(j.EndDate, CAST(GETDATE() AS DATE)) >= d.date
)

    /* Non Aggregate Data - use this SELECT in place of the aggregate below to inspect the rows */
    /*
    SELECT * FROM ranked r
    ORDER BY r.date, r.StartDate, IsNull(r.EndDate, GetDate()), r.GroupID
    OPTION (MaxRecursion 0)
    */

/* Aggregated Data */
SELECT r.GroupID
    , COUNT(*) AS daysEmployed
    , MIN(r.date) AS minStartDate
    , MAX(r.date) AS maxEndDate
    /* DATEDIFF returns the number of day boundaries crossed, which excludes one end point.
     | daysEmployed counts calendar days inclusively, so + 1 puts the range on the same footing;
     | without it a continuously employed group would show -1 unemployed days.
    */
    , DATEDIFF(DAY, MIN(r.date), MAX(r.date)) + 1 AS TotalDaysInRange
    /* Days NOT employed = inclusive days in range minus the days employed */
    , DATEDIFF(DAY, MIN(r.date), MAX(r.date)) + 1 - COUNT(*) AS unEmployedDays
FROM ranked r
WHERE r.ranker = 1
GROUP BY r.GroupID
ORDER BY r.GroupID
/* The default MaxRecursion setting is 100, so generating more than 100 dates with the recursive
 | CTE requires OPTION (MaxRecursion N).  N = 0 removes the limit altogether.
*/
OPTION (MaxRecursion 0)

按 GroupID 分组的总数的屏幕截图：

根据截至今天（2022 年 6 月 2 日，原文日期写作 6/2/22）的屏幕截图，总数为：

GroupID 23293 : 335 天工作

GroupID 26208 : 929 天工作

这篇 SO Post 提供了有关如何填充日期表的出色示例，其中一些答案无需使用 Option (MaxRecursion) 即可完成这一壮举

Get a list of dates between two dates using a function

【问题讨论】：

标签： sql-server

【解决方案1】：

我无权访问 SqlServer 实例来测试它，所以这是 SQLite 语法，但我认为转换它应该不难。

我采用的方法基本上是使用“日期”表，然后将 DaysPerJob 表加入其中，以便获取 GroupId 处于活动状态的每一天的记录。然后，您只需根据个人日期和 groupId 进行排名，以过滤掉“重叠”的工作天数。

/* Calendar of dates built with a recursive CTE: starts at the earliest StartDate and adds */
/* one day per step up to the current date. An existing date table could be used instead.  */
WITH dates(date) AS (
  SELECT MIN(StartDate) FROM DaysPerJob
  UNION ALL
  SELECT DATE(date, '+1 day') FROM dates WHERE date < date()
),
/* One row per (calendar date, job active on that date), numbered within each */
/* date / GroupID pair so that overlapping jobs can be collapsed to one row.   */
ranked AS (
  SELECT
    d.date,
    j.StartDate,
    j.EndDate,
    j.GroupID,
    j.JobDesc,
    ROW_NUMBER() OVER (PARTITION BY d.date, j.GroupID) AS ranker
  FROM dates AS d
  INNER JOIN DaysPerJob AS j
    ON date(j.StartDate) <= date(d.date)
   /* a job without an EndDate counts as active through the current date */
   AND ifnull(j.EndDate, date()) >= date(d.date)
  WHERE j.GroupID IS NOT NULL
)
/* Keeping only ranker = 1 leaves one row per GroupID per day worked. */
SELECT
  COUNT(*) AS days_worked,
  GroupID
FROM ranked AS r
WHERE r.ranker = 1
GROUP BY GroupID;

【讨论】：

这个问题的答案非常直观，无需实际应用任何日期函数。生成日期表后，您可以简单地计算使用排名器删除重复记录后的唯一天数。这很简单。

【解决方案2】：

这是经过一段时间整理数据后得出的另一个答案。请原谅我，我把它放在一个更容易使用的格式中。这应该可以。

/* Data Setup
 | Sample job history, one row per job, keyed by GroupID.
 | A NULL EndDate means the job has not ended yet.
*/
DROP TABLE IF EXISTS #DaysPerJob;

CREATE TABLE #DaysPerJob
(
    GroupID   INT,
    JobDesc   VARCHAR(100),
    StartDate DATE,
    EndDate   DATE
)

INSERT INTO #DaysPerJob (GroupID, JobDesc, StartDate, EndDate)
VALUES
    (23293, 'Food Prep', '2017-03-01', '2017-07-17'),
    (23293, 'Finisher', '2021-11-19', NULL),
    (23293, 'Cashier', '2021-12-06', '2021-12-10'),
    (26208, '3rd SHift Stocker', '2019-09-25', '2020-11-05'),
    (26208, 'Order Fulfillment Assoc', '2020-08-05', '2021-04-16'),
    (26208, 'Customer Service Rep', '2021-05-10', '2021-10-15'),
    (26208, 'Delivery Driver', '2021-11-15', NULL),
    (26208, 'Another Job', '2022-02-23', '2022-03-02'),
    (26208, 'Same Day Job Start as Prev Job End', '2022-03-01', NULL)

--SELECT * FROM #DaysPerJob dpj ORDER BY dpj.GroupID, dpj.StartDate, dpj.EndDate

/* Days Per Job Calculations
 | Returns one row per job with DayCalc = the number of calendar days that job adds to the
 | group's employment history, i.e. days not already covered by a job that sorts earlier
 | (by StartDate, EndDate) in the same GroupID.  SUM(DayCalc) per GroupID is therefore the
 | number of unique days employed.
 |
 | Revised: an earlier version of this query returned NULL for every row from the first
 | open-ended job (EndDate IS NULL) onward, used DATEDIFF without counting both end points,
 | and did not handle a job fully contained in an earlier one, so its totals were wrong.
*/

;WITH Normalized AS
(
    --Give open-ended jobs an effective end date of today and fix the row order within each group
    SELECT
        GroupID, JobDesc, StartDate, EndDate,
        EffectiveEndDate = ISNULL(EndDate, CAST(GETDATE() AS DATE)),
        RowInGroup = ROW_NUMBER() OVER (PARTITION BY GroupID ORDER BY StartDate, EndDate)
    FROM
        #DaysPerJob
)
,Covered AS
(
    --Latest effective end date among all earlier rows of the group (NULL for the first row):
    --every day up to and including that date that falls inside a job has already been counted
    SELECT
        GroupID, JobDesc, StartDate, EndDate, EffectiveEndDate, RowInGroup,
        CoveredThrough = MAX(EffectiveEndDate) OVER (PARTITION BY GroupID ORDER BY RowInGroup
                                                     ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING)
    FROM
        Normalized
)
,FirstNewDay AS
(
    --First day of the job that is not already covered by an earlier job
    SELECT
        GroupID, JobDesc, StartDate, EndDate, EffectiveEndDate, RowInGroup,
        FirstUncoveredDate = CASE WHEN CoveredThrough >= StartDate THEN DATEADD(DAY, 1, CoveredThrough) ELSE StartDate END
    FROM
        Covered
)
--Final output
SELECT
    F.GroupID, F.JobDesc, F.StartDate, F.EndDate,
    --DATEDIFF counts day boundaries crossed, so + 1 includes both the first and the last day.
    --A job with no uncovered days (fully overlapped, or not started yet) contributes 0.
    DayCalc = CASE WHEN F.FirstUncoveredDate > F.EffectiveEndDate THEN 0
                   ELSE DATEDIFF(DAY, F.FirstUncoveredDate, F.EffectiveEndDate) + 1 END
FROM
    FirstNewDay F
ORDER BY
    F.GroupID, F.RowInGroup

【讨论】：

我当然看到了为此付出的努力，而您采用的方法也是我所采用的方法。然而，在看到这个问题的另一个答案之后，它非常直观，并且不需要实际应用任何日期函数。生成日期表后，您可以简单地计算使用排名器删除重复记录后的唯一天数。我很想给你的答案一个UpVote，但是总数不正确，或者至少我无法找到一种方法来使它们准确。
感谢您的评论。我同意，使用日期表是一种更优雅的解决方案，并且需要更少的数据。如前所述，我在试图解决这个独特的问题时迷失了方向。很高兴您找到了解决方案。
stackoverflow.com/questions/1378593/…