【发布时间】:2018-06-12 03:21:23
【问题描述】:
我创建了一个使用dplyr 库的存储过程,目的是按StudyID 和ProductNumber 分组。我想返回字段 c1-c8 中每个值的平均值以及相同字段的标准偏差。
我的存储过程如下:
ALTER PROCEDURE [dbo].[spCodeMeans]
@StudyID INT,
@StudyID_outer INT OUT
AS
BEGIN
SET NOCOUNT ON;
DECLARE @inquery NVARCHAR(MAX) = N'Select
c.StudyID, c.RespID, c.ProductNumber, c.ProductSequence, c.BottomScaleValue,
c.BottomScaleAnchor, c.TopScaleValue, c.TopScaleAnchor, c.StudyDate,
c.DayOfWeek, c.A, c.B, c.C, c.D, c.E, c.F,
c.DependentVarYN, c.VariableAttributeID, c.VarAttributeName, c.[1] as c1,
c.[2] as c2, c.[3] as c3, c.[4] as c4, c.[5] as c5, c.[6] as c6, c.[7] as c7, c.[8] as c8
from ClosedStudyResponses c
--Sensory Value Attributes only for mean and standard deviation analytics.
where VariableAttributeID = 1
and c.StudyID = 22'
;
BEGIN TRY
EXEC sp_execute_external_script
@language = N'R',
@script = N'
library(dplyr)
OutputDataSet <- InputDataSet %>%
group_by (StudyID, ProductNumber) %>%
summarise(c1_mean = mean(c1), c2_mean = mean(c2), c3_mean = mean(c3), c4_mean = mean(c4), c5_mean = mean(c5), c6_mean = mean(c6),
c7_mean = mean(c7), c8_mean = mean(c8), c1_sd = sd(c1), c2_sd = sd(c2), c3_sd = sd(c3), c4_sd = sd(c4), c5_sd = sd(c5), c6_sd = sd(c6),
c7_sd = sd(c7), c8_sd = sd(c8)) %>%
`colnames<-`(c("StudyID", "ProductNumber","c1_mean","c2_mean","c3_mean","c4_mean","c5_mean","c6_mean","c7_mean",
"c8_mean","c1_sd","c2_sd","c3_sd","c4_sd","c5_sd","c6_sd","c7_sd","c8_sd"))',
@input_data_1 = @inquery
END TRY
BEGIN CATCH
THROW;
END CATCH
结果不包括列名,也不包括字段 c1 到 c8 的平均值和标准差。如何调整我的语法以完成此操作?
更新: 根据建议,我修改了存储过程以将结果插入到临时表中。修改后的语法如下:
ALTER PROCEDURE [dbo].[spCodeMeans]
-- Add the parameters for the stored procedure here
@StudyID int,
@StudyID_outer int OUT
AS
BEGIN
-- SET NOCOUNT ON added to prevent extra result sets from
-- interfering with SELECT statements.
SET NOCOUNT ON;
--Create temptable to store the outputdataset
Create table #temp_table (
StudyID int,
ProductNumber int,
c1_mean decimal,
c2_mean decimal,
c3_mean decimal,
c4_mean decimal,
c5_mean decimal,
c6_mean decimal,
c7_mean decimal,
c8_mean decimal,
c1_sd decimal,
c2_sd decimal,
c3_sd decimal,
c4_sd decimal,
c5_sd decimal,
c6_sd decimal,
c7_sd decimal,
c8_sd decimal
);
-- Insert statements for procedure here
Declare @inquery nvarchar(max) = N'Select
c.StudyID, c.RespID, c.ProductNumber, c.ProductSequence, c.BottomScaleValue,
c.BottomScaleAnchor, c.TopScaleValue, c.TopScaleAnchor, c.StudyDate,
c.DayOfWeek, c.A, c.B, c.C, c.D, c.E, c.F,
c.DependentVarYN, c.VariableAttributeID, c.VarAttributeName, c.[1] as c1,
c.[2] as c2, c.[3] as c3, c.[4] as c4, c.[5] as c5, c.[6] as c6, c.[7] as c7, c.[8] as c8
from ClosedStudyResponses c
--Sensory Value Attributes only for mean and standard deviation analytics.
where VariableAttributeID = 1
and c.StudyID = 22'
;
BEGIN TRY
Insert into #temp_table
exec sp_execute_external_script
@language = N'R',
@script = N'
library(dplyr)
OutputDataSet <- InputDataSet %>%
group_by (StudyID, ProductNumber) %>%
summarise_all(.funs=c(mean, sd)) %>%
setNames(c("StudyID","ProductNumber",
paste0("c",1:8, "_mean"),
paste0("c",1:8, "_sd")))
',
@input_data_1 = @inquery,
@output_data_1 = N'OutputDataSet';
END TRY
BEGIN CATCH
THROW;
END CATCH
Select * from #temp_table;
END
当我尝试运行该过程时,我收到一条错误消息,指出 Procedure expects parameter '@params' of type 'ntext/nchar/nvarchar'. 请注意,我已经将 @inquery 声明为 NVARCHAR(Max)。是否还有其他步骤我忽略了?
更新 #2: 我花了一些时间重新处理存储过程,发现输出需要采用 data.frame 的格式。我相应地修改了存储过程的 R 部分。现在我的列名出现了,但没有返回数据、平均值或标准差。当前存储过程如下:
ALTER PROCEDURE [dbo].[spCodeMeans]
-- Add the parameters for the stored procedure here
@StudyID int
AS
BEGIN
-- SET NOCOUNT ON added to prevent extra result sets from
-- interfering with SELECT statements.
SET NOCOUNT ON;
-- Insert statements for procedure here
Declare @sStudy varchar(50)
Set @sStudy = Convert(Varchar(50),@StudyID)
Declare @inquery nvarchar(max) = N'Select
c.StudyID, c.RespID, c.ProductNumber, c.ProductSequence, c.BottomScaleValue,
c.BottomScaleAnchor, c.TopScaleValue, c.TopScaleAnchor, c.StudyDate,
c.DayOfWeek, c.A, c.B, c.C, c.D, c.E, c.F,
c.DependentVarYN, c.VariableAttributeID, c.VarAttributeName, c.[1] as c1,
c.[2] as c2, c.[3] as c3, c.[4] as c4, c.[5] as c5, c.[6] as c6, c.[7] as c7, c.[8] as c8
from ClosedStudyResponses c
--Sensory Value Attributes only for mean and standard deviation analytics.
where VariableAttributeID = 1
and c.StudyID =' +@sStudy ;
BEGIN TRY
--Insert into CodeMeans
exec sp_execute_external_script
@language = N'R',
@script = N'
library(dplyr)
codemeans <- function(StudyID){
res <- InputDataSet %>%
group_by (StudyID, ProductNumber) %>%
summarise_all(.funs=c(mean, sd)) %>%
setNames(c("StudyID","ProductNumber",
paste0("c",1:8, "_mean"),
paste0("c",1:8, "_sd")))
df <- data.frame(res)
}
',
@input_data_1 = @inquery,
@output_data_1_name = N'df',
@params = N'@StudyID int',
@StudyID = @StudyID
END TRY
BEGIN CATCH
THROW;
END CATCH
Select * from CodeMeans;
END
因此,此时,我正在寻求有关如何从基本查询返回输出以及均值和标准差的指导。
【问题讨论】:
-
那么输出包括什么?另外,为什么不在 SQL 中运行标准聚合查询呢?为什么使用 R?支持均值和标准差aggregate functions。
标签: r sql-server tsql dplyr