You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
def similarity2(s1, s2):
"""
get similarity of two strings
:param s1:
:param s2:
:return:
"""
if not s1 or not s2:
return 0
s1_set = set(list(s1))
s2_set = set(list(s2))
intersection = s1_set.intersection(s2_set)
union = s1_set.intersection(s2_set)
return len(intersection) / len(union)
union = s1_set.intersection(s2_set) # 这个应该是并集才对吧,源码里边应该是取错了
The text was updated successfully, but these errors were encountered:
from gerapy_auto_extractor.extractors.base import BaseExtractor
from lxml.html import HtmlElement, fromstring
from gerapy_auto_extractor.patterns.title import METAS
from gerapy_auto_extractor.utils.lcs import lcs_of_2
from gerapy_auto_extractor.utils.similarity import similarity2
class TitleExtractor(BaseExtractor):
"""
Title Extractor which extract title of page
"""
def extract_by_meta(self, element: HtmlElement) -> str:
"""
extract according to meta
:param element:
:return: str
"""
for xpath in METAS:
title = element.xpath(xpath)
if title:
return ''.join(title)
def extract_by_title(self, element: HtmlElement):
"""
get title from <title> tag
:param element:
:return:
"""
return ''.join(element.xpath('//title//text()')).strip()
def extract_by_hs(self, element: HtmlElement):
"""
get title from all h1-h3 tag
:param element:
:return:
"""
hs = element.xpath('//h1//text()|//h2//text()|//h3//text()')
return hs or []
def extract_by_h(self, element: HtmlElement):
"""
extract by h tag, priority h1, h2, h3
:param elemeent:
:return:
"""
for xpath in ['//h1', '//h2', '//h3']:
children = element.xpath(xpath)
if not children:
continue
child = children[0]
texts = child.xpath('./text()')
if texts and len(texts):
return texts[0].strip()
def process(self, element: HtmlElement):
"""
extract title from element
:param element:
:return:
"""
title_extracted_by_meta = self.extract_by_meta(element)
title_extracted_by_h = self.extract_by_h(element)
title_extracted_by_hs = self.extract_by_hs(element)
title_extracted_by_title = self.extract_by_title(element)
# split logic to add more
if title_extracted_by_meta:
return title_extracted_by_meta
title_extracted_by_hs = sorted(title_extracted_by_hs, key=lambda x: len(x), reverse=True) # 最长字符的标签放前面
max_dict = {}
for index, h in enumerate(title_extracted_by_hs):
jd = similarity2(h, title_extracted_by_title)
if jd > 0:
max_dict[index] = int(jd)
key = 10
for k, v in max_dict.items():
if v == max(max_dict.values()):
key = k
print(title_extracted_by_hs[key])
if title_extracted_by_hs:
return title_extracted_by_hs[key]
return title_extracted_by_title
title_extractor = TitleExtractor()
def extract_title(html):
"""
extract title from html
:param html:
:return:
"""
result = title_extractor.extract(html)
return result
def similarity2(s1, s2):
"""
get similarity of two strings
:param s1:
:param s2:
:return:
"""
if not s1 or not s2:
return 0
s1_set = set(list(s1))
s2_set = set(list(s2))
intersection = s1_set.intersection(s2_set)
union = s1_set.intersection(s2_set)
return len(intersection) / len(union)
The text was updated successfully, but these errors were encountered: