【发布时间】:2018-02-06 01:33:31
【问题描述】:
我在一个 EC2 实例上运行 RStudio,该实例具有与之关联的 IAM 角色,允许对其进行完全 S3 访问。我想将一个文件从 S3 读入 RStudio。
我尝试通过sparklyr 进行如下操作:
spark_install(version = "2.1.0")
sc <- spark_connect(master = "local")
ctx <- sparklyr::spark_context(sc)
#Use below to set the java spark context
jsc <- invoke_static(
sc,
"org.apache.spark.api.java.JavaSparkContext",
"fromSparkContext",
ctx
)
hconf <- jsc %>% invoke("hadoopConfiguration")
hconf %>% invoke("set","fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
hconf %>% invoke("set","fs.s3.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
usercsv_tbl <- spark_read_csv(sc,name = "temp",path = "s3a://<bucket>/filename.csv")
我收到以下错误:
Error: java.lang.RuntimeException: java.lang.ClassNotFoundException: Class org.apache.hadoop.fs.s3a.S3AFileSystem not found
at org.apache.hadoop.conf.Configuration.getClass(Configuration.java:2195)
at org.apache.hadoop.fs.FileSystem.getFileSystemClass(FileSystem.java:2654)
at org.apache.hadoop.fs.FileSystem.createFileSystem(FileSystem.java:2667)
at org.apache.hadoop.fs.FileSystem.access$200(FileSystem.java:94)
at org.apache.hadoop.fs.FileSystem$Cache.getInternal(FileSystem.java:2703)
at org.apache.hadoop.fs.FileSystem$Cache.get(FileSystem.java:2685)
at org.apache.hadoop.fs.FileSystem.get(FileSystem.java:373)
at org.apache.hadoop.fs.Path.getFileSystem(Path.java:295)
at org.apache.spark.sql.execution.datasources.DataSource$$anonfun$14.apply(DataSource.scala:372)
at org.apache.spark.sql.execution.datasources.DataSource$$anonfun$14.apply(DataSource.scala:370)
at scala.collection.TraversableLike$$anonfun$flatMap$1.apply(TraversableLike.scala:241)
at scala.collection.TraversableLike$$anonfun$flatMap$1.apply(TraversableLike.scala:241)
at scala.collection.immutable.List.foreach(List.scala:381)
at scala.collection.TraversableLike$class.flatMap(TraversableLike.scala:241)
at scala.collection.immutable.List.flatMap(List.scala:344)
at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:370)
at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:152)
at org.apache.spark.sql.DataFrameReader.csv(DataFrameReader.scala:415)
at org.apache.spark.sql.DataFrameReader.csv(DataFrameReader.scala:352)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at sparklyr.Invoke$.invoke(invoke.scala:102)
at sparklyr.StreamHandler$.handleMethodCall(stream.scala:97)
at sparklyr.StreamHandler$.read(stream.scala:62)
at sparklyr.BackendHandler.channelRead0(handler.scala:52)
at sparklyr.BackendHandler.channelRead0(handler.scala:14)
at io.netty.channel.SimpleChannelInboundHandler.channelRead(SimpleChannelInboundHandler.java:105)
at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:367)
at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:353)
at io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:346)
at io.netty.handler.codec.MessageToMessageDecoder.channelRead(MessageToMessageDecoder.java:102)
at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:367)
at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:353)
at io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:346)
at io.netty.handler.codec.ByteToMessageDecoder.fireChannelRead(ByteToMessageDecoder.java:293)
at io.netty.handler.codec.ByteToMessageDecoder.channelRead(ByteToMessageDecoder.java:267)
at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:367)
at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:353)
at io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:346)
at io.netty.channel.DefaultChannelPipeline$HeadContext.channelRead(DefaultChannelPipeline.java:1294)
at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:367)
at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:353)
at io.netty.channel.DefaultChannelPipeline.fireChannelRead(DefaultChannelPipeline.java:911)
at io.netty.channel.nio.AbstractNioByteChannel$NioByteUnsafe.read(AbstractNioByteChannel.java:131)
at io.netty.channel.nio.NioEventLoop.processSelectedKey(NioEventLoop.java:652)
at io.netty.channel.nio.NioEventLoop.processSelectedKeysOptimized(NioEventLoop.java:575)
at io.netty.channel.nio.NioEventLoop.processSelectedKeys(NioEventLoop.java:489)
at io.netty.channel.nio.NioEventLoop.run(NioEventLoop.java:451)
at io.netty.util.concurrent.SingleThreadEventExecutor$2.run(SingleThreadEventExecutor.java:140)
at io.netty.util.concurrent.DefaultThreadFactory$DefaultRunnableDecorator.run(DefaultThreadFactory.java:144)
at java.lang.Thread.run(Thread.java:748)
Caused by: java.lang.ClassNotFoundException: Class org.apache.hadoop.fs.s3a.S3AFileSystem not found
at org.apache.hadoop.conf.Configuration.getClassByName(Configuration.java:2101)
at org.apache.hadoop.conf.Configuration.getClass(Configuration.java:2193)
... 52 more
我也试过使用cloudyr的aws.s3包如下:
library(aws.s3)
get_bucket(bucket = <bucketname>)
我收到以下错误:
List of 4
$ Code : chr "AccessDenied"
$ Message : chr "Access Denied"
$ RequestId: chr "CF4041D52D7523D2"
$ HostId : chr "vtkUIF7qsUwlGxBUaDpfXk9f6QHIelLxcsV0Nigla9yJicBl1YpxtrgGr82IoMyYPu6uvDSpAGI="
- attr(*, "headers")=List of 6
..$ x-amz-request-id : chr "CF4041D52D7523D2"
..$ x-amz-id-2 : chr "vtkUIF7qsUwlGxBUaDpfXk9f6QHIelLxcsV0Nigla9yJicBl1YpxtrgGr82IoMyYPu6uvDSpAGI="
..$ content-type : chr "application/xml"
..$ transfer-encoding: chr "chunked"
..$ date : chr "Mon, 28 Aug 2017 17:49:48 GMT"
..$ server : chr "AmazonS3"
..- attr(*, "class")= chr [1:2] "insensitive" "list"
- attr(*, "class")= chr "aws_error"
NULL
Error in parse_aws_s3_response(r, Sig, verbose = verbose) :
Forbidden (HTTP 403).
如何使用 EC2 的 IAM 角色访问 S3 数据,这样我就不必手动输入我的凭据?
【问题讨论】:
-
对于 cloudyr,您必须安装 aws.ec2metadata 包才能获取 EC2 元数据。您可以通过调用
aws.signature::locate_credentials()来检查它是否有效 -
谢谢!成功了!
-
你能证明什么有效吗?我有同样的问题,即使我可以通过这种方式获取凭据,我似乎无法让
aws.s3实际使用它们来获取存储桶。 -
@charmander 什么对你有用?
aws.signature::locate_credentials()仅在您明确定义凭证时才定位它们,但它不适用于 IAM 角色(您的问题是关于)
标签: r amazon-web-services amazon-s3 amazon-ec2 sparklyr