Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

# -*- coding: UTF-8 -*- 

# Copyright 2011 Luc Saffre 

# License: BSD (see file COPYING for details) 

 

""" 

This is taken from Helio Perroni Filho's answer at 

http://stackoverflow.com/questions/328356/extracting-text-from-html-file-using-python/3987802#3987802 

 

I added the extract_summary() function.  

 

Lino never really used this module.  

It was added and dropped the same day  

for the server-side approach of :srcref:`docs/tickets/44`. 

""" 

from __future__ import print_function 

from future import standard_library 

standard_library.install_aliases() 

 

from html.parser import HTMLParser 

from re import sub 

from sys import stderr 

from traceback import print_exc 

 

 

class _DeHTMLParser(HTMLParser):
    """HTML parser that keeps only the text content of a document.

    Character data is collected in document order with runs of whitespace
    collapsed to a single space.  A ``p`` tag contributes a blank line and
    a ``br`` tag a line break; every other tag is dropped.  Feed the markup
    with ``feed()``/``close()`` and then call ``text()`` for the result.
    """

    def __init__(self):
        HTMLParser.__init__(self)
        # Collected output fragments, joined by text().
        self.__text = []

    def handle_data(self, data):
        """Store a run of character data, ignoring whitespace-only runs."""
        text = data.strip()
        if text:
            text = sub('[ \t\r\n]+', ' ', text)
            # The trailing space keeps this run separated from the next one;
            # text() strips the one left over at the very end.
            self.__text.append(text + ' ')

    def handle_starttag(self, tag, attrs):
        """Translate paragraph and line-break tags into newlines."""
        if tag == 'p':
            self.__text.append('\n\n')
        elif tag == 'br':
            self.__text.append('\n')

    def handle_startendtag(self, tag, attrs):
        """Treat a self-closing tag exactly like its opening form.

        Delegating keeps ``<br/>`` and ``<br>`` consistent (one line break
        each) instead of maintaining a second, diverging tag table here.
        """
        self.handle_starttag(tag, attrs)

    def text(self):
        """Return everything collected so far as one stripped string."""
        return ''.join(self.__text).strip()

 

 

def dehtml(text):
    """Convert HTML markup to plain text.

    Feeds *text* through ``_DeHTMLParser`` and returns the text it
    collected.  The conversion is best-effort: if anything goes wrong the
    traceback is printed to stderr and *text* is returned unchanged.
    """
    try:
        parser = _DeHTMLParser()
        parser.feed(text)
        parser.close()
        return parser.text()
    except Exception:
        # ``Exception`` rather than a bare ``except`` so that
        # KeyboardInterrupt and SystemExit are not swallowed here.
        print_exc(file=stderr)
        return text

 

 

def extract_summary(text, max_length=30):
    """Return a short one-line summary of *text*.

    If *text* starts with ``<`` it is treated as HTML and converted to
    plain text with ``dehtml`` first.  The summary is the first line of the
    text, cut to at most *max_length* characters (default 30).  ``"..."``
    is appended whenever something was left out, i.e. the text had more
    than one line or the first line had to be cut.
    """
    if text.startswith('<'):
        text = dehtml(text)
    # A single partition yields both the first line and, through the
    # separator, whether any further lines exist.
    ln, newline, _rest = text.partition('\n')
    ellipsis = bool(newline)
    if len(ln) > max_length:
        ln = ln[:max_length]
        ellipsis = True
    if ellipsis:
        ln += "..."
    return ln

 

 

def main():
    """Demo: print the plain-text rendering of a small sample HTML page."""
    sample = r''' 

        <html> 

            <body> 

                <b>Project:</b> DeHTML<br> 

                <b>Description</b>:<br> 

                This small script is intended to allow conversion from HTML markup to  

                plain text. 

            </body> 

        </html> 

    '''
    plain = dehtml(sample)
    print(plain)

 

 

# Run the demo only when this file is executed as a script, not on import.
if __name__ == '__main__': 

    main()