【发布时间】:2021-09-24 04:24:07
【问题描述】:
我用 BeautifulSoup 从 HTML 文件中提取了一个 <script> 标签:
from bs4 import BeautifulSoup

# Parse the CentriMo report; the context manager closes the file handle
# once BeautifulSoup has consumed it.
with open("centrimo.html") as html:
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    parsed_html = BeautifulSoup(html, "html.parser")

# First <script> tag of the parsed document (the original referenced an
# undefined name, `parsed_cen`, here).
script_data = parsed_html.script
现在我想从该 script 标签包含的字符串中,提取变量“sequences”、“neg_sequences”、“seqs”和“neg_seqs”中的信息。
<script type="text/javascript">
//@JSON_VAR data
var data = {
"version": "5.3.3",
"revision": "1667d7719daf2af1693cc039ba463bc4d2304d23",
"release": "Sun Feb 7 15:39:52 2021 -0800",
"program": "CentriMo",
"options": {
"cd": false,
"neg_sequences": true,
"noseq": false,
"mcc": false
},
"seqlen": 101,
"tested": 435,
"alphabet": {
"name": "DNA",
"like": "dna",
"ncore": 4
},
"background": [0.2788, 0.2212, 0.2212, 0.2788],
"sequences": [
"AT1G04100.1_CDS", "AT1G05860.1_CDS", "AT1G13910.1_CDS",
"AT1G21065.1_CDS", "AT1G26190.1_CDS", "AT1G32940.1_CDS",
"AT1G50575.1_CDS", "AT1G55810.1_CDS", "AT1G66430.1_CDS",
"AT1G71430.1_CDS", "AT1G77170.1_CDS", "AT1G78610.1_CDS",
"AT2G02955.1_CDS", "AT2G16280.1_CDS", "AT2G17080.1_CDS",
"AT2G19620.1_CDS", "AT2G19640.1_CDS", "AT2G30840.1_CDS",
"AT2G39450.1_CDS", "AT2G41380.1_CDS", "AT2G42580.1_CDS",
"AT3G01680.1_CDS", "AT3G05680.1_CDS", "AT3G20110.1_CDS",
"AT3G20260.1_CDS", "AT3G21360.1_CDS", "AT3G23070.1_CDS",
"AT3G23590.1_CDS", "AT3G46820.1_CDS", "AT3G48250.1_CDS",
"AT3G61200.1_CDS", "AT4G08510.1_CDS", "AT4G15070.1_CDS",
"AT4G24670.1_CDS", "AT4G25450.1_CDS", "AT4G28600.1_CDS",
"AT4G31910.1_CDS", "AT4G34810.1_CDS", "AT4G35030.3_CDS",
"AT4G37170.1_CDS", "AT4G38630.1_CDS", "AT4G39720.1_CDS",
"AT5G07340.1_CDS", "AT5G12970.1_CDS", "AT5G13470.1_CDS",
"AT5G18950.1_CDS", "AT5G22840.1_CDS", "AT5G25590.1_CDS",
"AT5G27395.1_CDS", "AT5G53370.1_CDS", "AT5G63610.1_CDS",
"AT5G64830.1_CDS", "AT5G64900.1_CDS", "AT5G67620.1_CDS"
],
"neg_sequences": [
"AT1G01600.1_CDS", "AT2G32480.1_CDS", "AT2G41740.1_CDS",
"AT3G19490.1_CDS", "AT3G24030.1_CDS", "AT3G25580.1_CDS",
"AT3G48330.1_CDS", "AT3G59220.1_CDS", "AT4G13340.1_CDS",
"AT4G33590.1_CDS", "AT5G03080.1_CDS", "AT5G23700.1_CDS",
"AT5G41010.1_CDS"
],
"motifs": [
{
"db": 2,
"id": "ath-miR419",
"alt": "MIMAT0001327",
"consensus": "CAACATCCTCAGCATTCATAA",
"len": 21,
"motif_evalue": "0.0e+000",
"motif_nsites": 20,
"n_tested": 40,
"score_threshold": 5,
"url": "http://www.mirbase.org/cgi-bin/mature.pl?mature_acc=MIMAT0001327",
"pwm": [
[0.164036, 0.479478, 0.163749, 0.192738],
[0.479764, 0.163749, 0.192452, 0.164036]
],
"total_sites": 10,
"sites": [
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
],
"neg_total_sites": 2,
"neg_sites": [
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
],
"seqs": [0, 1, 13, 15, 16, 23, 27, 36, 44, 48],
"neg_seqs": [3, 10],
"peaks": [
{
"center": 0,
"fisher_log_adj_pvalue": 0
}
]
}
]
};
</script>
我尝试用 json.loads 把该字符串转换为 JSON 对象,但出现以下错误:
import json
# Fails with JSONDecodeError: the <script> text is a JavaScript statement
# (a `//` comment followed by `var data = {...};`), not a bare JSON document,
# so the decoder rejects the first non-whitespace character.
j_script = json.loads(script_data.string)
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "/usr/lib/python3.7/json/__init__.py", line 348, in loads
return _default_decoder.decode(s)
File "/usr/lib/python3.7/json/decoder.py", line 337, in decode
obj, end = self.raw_decode(s, idx=_w(s, 0).end())
File "/usr/lib/python3.7/json/decoder.py", line 355, in raw_decode
raise JSONDecodeError("Expecting value", s, err.value) from None
json.decoder.JSONDecodeError: Expecting value: line 2 column 7 (char 7)
提前致谢
附言:我想解析的完整 HTML 文件的示例可以在这里找到 (here)。
编辑:在原始帖子中,我曾提到遇到的是缩进错误。那个错误是我手动编辑 JSON 对象(删除所有空格和“\n”字符)之后才出现的。虽然我认为这不会从根本上改变问题,但我仍为这一失误道歉。
[更新] 我参考了这个帖子 (post) 中的答案,并调整如下:
# Split on the first "=": tmp[2] is everything after `var data =`.
tmp = script_data.string.partition("=")
# Trim surrounding whitespace, then the ";" that ends the JavaScript
# statement, so only the object literal is left. This does not depend on the
# exact whitespace that follows the semicolon in the HTML file.
j_tmp = tmp[2].strip().rstrip(";")
j_script = json.loads(j_tmp)
第二行有点笨拙(我没能把另一个帖子 (post) 中的答案改造成适用于我的情况),但总的来说它可以解决问题。现在我正在尝试获取“motifs”列表中包含的“seqs”数据。
如果能帮我改进上面代码的第二行,我将不胜感激。
【问题讨论】:
-
可能是因为 script_data 字符串中包含了赋值语句
var data =,而不仅仅是对象本身 -
抱歉,实际上我从 json.loads 命令得到的错误与此不同,并且指向输入中的另一行,因为原始输入更长、也更复杂一些。我曾尝试手动编辑(删除制表符、换行符,甚至 \n 字符),然后再测试该命令,缩进错误就是在那时出现的。我现在不在电脑旁,等我回来后会把帖子改正确。
-
有一种方法可以获取目标数据,但是我从页面中获取的数据与您问题中的数据不同。比如
"version": "5.3.3" 在我这里是 "version": "5.3.2",等等
标签: python json html-parsing