Kouhei Sutou
null+****@clear*****
Wed Jul 5 16:57:43 JST 2017
Kouhei Sutou 2017-07-05 16:57:43 +0900 (Wed, 05 Jul 2017) New Revision: c2bc94ac549538f134706b66a721dc622bc2c19c https://github.com/ranguba/chupa-text-decomposer-html/commit/c2bc94ac549538f134706b66a721dc622bc2c19c Message: Ignore needless contents Modified files: lib/chupa-text/decomposers/html.rb test/test-html.rb Modified: lib/chupa-text/decomposers/html.rb (+78 -1) =================================================================== --- lib/chupa-text/decomposers/html.rb 2017-07-05 16:30:52 +0900 (b23c991) +++ lib/chupa-text/decomposers/html.rb 2017-07-05 16:57:43 +0900 (b753dc5) @@ -45,7 +45,7 @@ module ChupaText doc = Nokogiri::HTML.parse(html, nil, guess_encoding(html)) body_element = (doc % "body") if body_element - body = body_element.text.scrub.gsub(/^\s+|\s+$/, '') + body = extract_text(body_element, "").scrub.gsub(/^\s+|\s+$/, '') else body = "" end @@ -104,6 +104,83 @@ module ChupaText def guess_encoding_nkf(text) NKF.guess(text).name end + + def extract_text(element, text) + name = element.name.downcase + classes = (element["class"] || "").split + return text if noindex_element?(element, name, classes) + return text if header_element?(element, name, classes) + return text if footer_element?(element, name, classes) + + element.children.each do |child| + case child + when Nokogiri::XML::Text + text << child.text + when Nokogiri::XML::Element + extract_text(child, text) + end + end + + text + end + + def noindex_element?(element, name, classes) + case name + when "script", "noscript", "link", "style" + return true + end + + classes.each do |klass| + case klass + when "noindex", "robots-noindex" + return true + end + end + + false + end + + def header_element?(element, name, classes) + case name + when "header", "nav" + return true + end + + classes.each do |klass| + case klass + when "header" + return true + end + end + + case element["id"] + when "header" + return true + end + + false + end + + def footer_element?(element, name, classes) + case name + 
when "footer" + return true + end + + classes.each do |klass| + case klass + when "footer" + return true + end + end + + case element["id"] + when "footer" + return true + end + + false + end end end end Modified: test/test-html.rb (+132 -0) =================================================================== --- test/test-html.rb 2017-07-05 16:30:52 +0900 (b5b24a7) +++ test/test-html.rb 2017-07-05 16:57:43 +0900 (1477677) @@ -273,5 +273,137 @@ class TestHTML < Test::Unit::TestCase end end end + + sub_test_case("body") do + def normalize_decomposed_data(decomposed_data) + decomposed_data.body + end + + sub_test_case("noindex") do + def test_script + @data.body = <<-HTML +<html> + <body>Before<script>var x;</script>After</body> +</html> + HTML + assert_equal(["BeforeAfter"], + decompose(@data)) + end + + def test_noscript + @data.body = <<-HTML +<html> + <body>Before<noscript>Enable JavaScript!</noscript>After</body> +</html> + HTML + assert_equal(["BeforeAfter"], + decompose(@data)) + end + + def test_link + @data.body = <<-HTML +<html> + <body>Before<link rel="stylehseet">After</body> +</html> + HTML + assert_equal(["BeforeAfter"], + decompose(@data)) + end + + def test_style + @data.body = <<-HTML +<html> + <body>Before<style>a {color: "red";}</style>After</body> +</html> + HTML + assert_equal(["BeforeAfter"], + decompose(@data)) + end + + def test_noindex + @data.body = <<-HTML +<html> + <body>Before<div class="noindex">header</div>After</body> +</html> + HTML + assert_equal(["BeforeAfter"], + decompose(@data)) + end + + def test_robots_noindex + @data.body = <<-HTML +<html> + <body>Before<div class="robots-noindex">header</div>After</body> +</html> + HTML + assert_equal(["BeforeAfter"], + decompose(@data)) + end + end + + sub_test_case("header") do + def test_tag + @data.body = <<-HTML +<html> + <body>Before<header>header</header>After</body> +</html> + HTML + assert_equal(["BeforeAfter"], + decompose(@data)) + end + + def test_class + @data.body = <<-HTML 
+<html> + <body>Before<div class="header">header</div>After</body> +</html> + HTML + assert_equal(["BeforeAfter"], + decompose(@data)) + end + + def test_id + @data.body = <<-HTML +<html> + <body>Before<div id="header">header</div>After</body> +</html> + HTML + assert_equal(["BeforeAfter"], + decompose(@data)) + end + end + + sub_test_case("footer") do + def test_tag + @data.body = <<-HTML +<html> + <body>Before<footer>footer</footer>After</body> +</html> + HTML + assert_equal(["BeforeAfter"], + decompose(@data)) + end + + def test_class + @data.body = <<-HTML +<html> + <body>Before<div class="footer">footer</div>After</body> +</html> + HTML + assert_equal(["BeforeAfter"], + decompose(@data)) + end + + def test_id + @data.body = <<-HTML +<html> + <body>Before<div id="footer">footer</div>After</body> +</html> + HTML + assert_equal(["BeforeAfter"], + decompose(@data)) + end + end + end end end -------------- next part -------------- HTMLの添付ファイルを保管しました... Télécharger