现在的位置: 首页 > 综合 > 正文

002_026 Python 从OpenOffice.org的文档中提取文本

2018年02月14日 ⁄ 综合 ⁄ 共 8537字 ⁄ 字号 评论关闭

代码如下:

#encoding=utf-8

print '中国'

# Extract text from an OpenOffice.org document

# 1. Download OpenOffice: http://rj.baidu.com/soft/detail/15989.html?ald
# 2. Create and edit a document
# 3. Save it as .sxw, i.e. the OpenOffice.org XML 1.0 document format

import zipfile,re
# Matches one complete XML tag, from '<' through its closing '>'.
# It is a bytes pattern because ZipFile.read() hands back the raw member
# bytes; on Python 2 a b'' literal is an ordinary str, so it works there too.
rx_stripxml = re.compile(b'<[^>]*?>', re.DOTALL|re.MULTILINE)
def convert_OO(filename, want_text=True):
    """Convert an OpenOffice.org document to XML or plain text.

    Reads the 'content.xml' member of the zip archive at *filename*.

    Args:
        filename: path of the OpenOffice.org document (a zip archive).
        want_text: when true (the default), strip every XML tag and
            collapse each run of whitespace to a single space; when false,
            return 'content.xml' unchanged.

    Returns:
        A byte string in the encoding used by 'content.xml'.  XML entities
        such as '&apos;' are left as they appear in the file.

    Raises:
        KeyError: if the archive has no 'content.xml' member.
        zipfile.BadZipfile: if *filename* is not a valid zip archive.
    """
    zf = zipfile.ZipFile(filename, "r")
    try:
        data = zf.read('content.xml')
    finally:
        # Release the file handle even when the member is missing.
        zf.close()
    if want_text:
        # Replace each tag with a space so text from adjacent elements does
        # not run together, then normalise whitespace.  Note that inline
        # markup inside a word therefore also introduces a space.
        data = b" ".join(rx_stripxml.sub(b" ", data).split())
    return data

# Demo: dump the sample document twice -- first with the tags stripped,
# then as the raw content.xml.
filename=r'C:\Users\Administrator\Desktop\test.sxw'
for banner, as_text in (('---------1 Text', True), ('---------2 XML', False)):
    print(banner)
    data = convert_OO(filename, as_text)
    print(data)

打印结果如下:

中国
---------1 Text
?xmlversion="1.0"encoding="UTF-8"?>office:document-contentxmlns:office="http://openoffice.org/2000/office"xmlns:style="http://openoffice.org/2000/style"xmlns:text="http://openoffice.org/2000/text"xmlns:table="http://openoffice.org/2000/table"xmlns:draw="http://openoffice.org/2000/drawing"xmlns:fo="http://www.w3.org/1999/XSL/Format"xmlns:xlink="http://www.w3.org/1999/xlink"xmlns:dc="http://purl.org/dc/elements/1.1/"xmlns:meta="http://openoffice.org/2000/meta"xmlns:number="http://openoffice.org/2000/datastyle"xmlns:svg="http://www.w3.org/2000/svg"xmlns:chart="http://openoffice.org/2000/chart"xmlns:dr3d="http://openoffice.org/2000/dr3d"xmlns:math="http://www.w3.org/1998/Math/MathML"xmlns:form="http://openoffice.org/2000/form"xmlns:script="http://openoffice.org/2000/script"xmlns:ooo="http://openoffice.org/2004/office"xmlns:ooow="http://openoffice.org/2004/writer"xmlns:oooc="http://openoffice.org/2004/calc"xmlns:dom="http://www.w3.org/2001/xml-events"xmlns:xforms="http://www.w3.org/2002/xforms"xmlns:xsd="http://www.w3.org/2001/XMLSchema"xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"xmlns:rpt="http://openoffice.org/2005/report"xmlns:of="urn:oasis:names:tc:opendocument:xmlns:of:1.2"xmlns:xhtml="http://www.w3.org/1999/xhtml"xmlns:grddl="http://www.w3.org/2003/g/data-view#"xmlns:tableooo="http://openoffice.org/2009/table"xmlns:field="urn:openoffice:names:experimental:ooo-ms-interop:xmlns:field:1.0"office:version="1.2"office:class="text">office:script/>office:font-decls>style:font-declstyle:name="Mangal1"fo:font-family="Mangal"/>style:font-declstyle:name="Mangal2"fo:font-family="Mangal"style:font-pitch="variable"/>style:font-declstyle:name="宋体1"fo:font-family="宋体"style:font-pitch="variable"/>style:font-declstyle:name="微软雅黑"fo:font-family="微软雅黑"style:font-pitch="variable"/>style:font-declstyle:name="TimesNewRoman"fo:font-family="&apos;TimesNewRoman&apos;"style:font-family-generic="roman"style:font-pitch="variable"/>style:font-declstyle:name="Arial"fo:font-family="Arial"
style:font-family-generic="swiss"style:font-pitch="variable"/>style:font-declstyle:name="Mangal"fo:font-family="Mangal"style:font-family-generic="system"style:font-pitch="variable"/>style:font-declstyle:name="宋体"fo:font-family="宋体"style:font-family-generic="system"style:font-pitch="variable"/>/office:font-decls>office:automatic-styles>style:stylestyle:name="P1"style:family="paragraph"style:parent-style-name="Standard"style:master-page-name="Standard">style:propertiesstyle:page-number="auto"/>/style:style>style:stylestyle:name="T1"style:family="text">style:propertiesfo:color="#800000"/>/style:style>style:stylestyle:name="fr1"style:family="graphics"style:parent-style-name="Graphics">style:propertiesstyle:horizontal-pos="center"style:horizontal-rel="paragraph"style:mirror="none"draw:mirror="false"fo:clip="rect(0cm,0cm,0cm,0cm)"draw:luminance="0%"draw:contrast="0%"draw:red="0%"draw:green="0%"draw:blue="0%"draw:gamma="1"draw:color-inversion="false"draw:color-mode="standard"draw:transparency="0%"/>/style:style>/office:automatic-styles>office:body>office:formsform:automatic-focus="false"form:apply-design-mode="false"/>text:sequence-decls>text:sequence-decltext:display-outline-level="0"text:name="Illustration"/>text:sequence-decltext:display-outline-level="0"text:name="Table"/>text:sequence-decltext:display-outline-level="0"text:name="Text"/>text:sequence-decltext:display-outline-level="0"text:name="Drawing"/>/text:sequence-decls>text:ptext:style-name="P1">IloveChina./text:p>text:ptext:style-name="Standard">我text:spantext:style-name="T1">喜欢中国。/text:span>/text:p>text:ptext:style-name="Standard"/>text:ptext:style-name="Standard">draw:imagedraw:style-name="fr1"draw:name="图形1"text:anchor-type="paragraph"svg:width="16.934cm"svg:height="12.701cm"draw:z-index="0"xlink:href="#Pictures/1000000000000280000001E0EE6E838D.jpg"xlink:type="simple"xlink:show="embed"xlink:actuate="onLoad"/>/text:p>/office:body>/office:document-content>
---------2 XML
<?xml version="1.0" encoding="UTF-8"?>
<office:document-content xmlns:office="http://openoffice.org/2000/office" xmlns:style="http://openoffice.org/2000/style" xmlns:text="http://openoffice.org/2000/text" xmlns:table="http://openoffice.org/2000/table" xmlns:draw="http://openoffice.org/2000/drawing"
xmlns:fo="http://www.w3.org/1999/XSL/Format" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:meta="http://openoffice.org/2000/meta" xmlns:number="http://openoffice.org/2000/datastyle" xmlns:svg="http://www.w3.org/2000/svg"
xmlns:chart="http://openoffice.org/2000/chart" xmlns:dr3d="http://openoffice.org/2000/dr3d" xmlns:math="http://www.w3.org/1998/Math/MathML" xmlns:form="http://openoffice.org/2000/form" xmlns:script="http://openoffice.org/2000/script" xmlns:ooo="http://openoffice.org/2004/office"
xmlns:ooow="http://openoffice.org/2004/writer" xmlns:oooc="http://openoffice.org/2004/calc" xmlns:dom="http://www.w3.org/2001/xml-events" xmlns:xforms="http://www.w3.org/2002/xforms" xmlns:xsd="http://www.w3.org/2001/XMLSchema" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xmlns:rpt="http://openoffice.org/2005/report" xmlns:of="urn:oasis:names:tc:opendocument:xmlns:of:1.2" xmlns:xhtml="http://www.w3.org/1999/xhtml" xmlns:grddl="http://www.w3.org/2003/g/data-view#" xmlns:tableooo="http://openoffice.org/2009/table" xmlns:field="urn:openoffice:names:experimental:ooo-ms-interop:xmlns:field:1.0"
office:version="1.2" office:class="text"><office:script/><office:font-decls><style:font-decl style:name="Mangal1" fo:font-family="Mangal"/><style:font-decl style:name="Mangal2" fo:font-family="Mangal" style:font-pitch="variable"/><style:font-decl style:name="宋体1"
fo:font-family="宋体" style:font-pitch="variable"/><style:font-decl style:name="微软雅黑" fo:font-family="微软雅黑" style:font-pitch="variable"/><style:font-decl style:name="Times New Roman" fo:font-family="&apos;Times New Roman&apos;" style:font-family-generic="roman"
style:font-pitch="variable"/><style:font-decl style:name="Arial" fo:font-family="Arial" style:font-family-generic="swiss" style:font-pitch="variable"/><style:font-decl style:name="Mangal" fo:font-family="Mangal" style:font-family-generic="system" style:font-pitch="variable"/><style:font-decl
style:name="宋体" fo:font-family="宋体" style:font-family-generic="system" style:font-pitch="variable"/></office:font-decls><office:automatic-styles><style:style style:name="P1" style:family="paragraph" style:parent-style-name="Standard" style:master-page-name="Standard"><style:properties
style:page-number="auto"/></style:style><style:style style:name="T1" style:family="text"><style:properties fo:color="#800000"/></style:style><style:style style:name="fr1" style:family="graphics" style:parent-style-name="Graphics"><style:properties style:horizontal-pos="center"
style:horizontal-rel="paragraph" style:mirror="none" draw:mirror="false" fo:clip="rect(0cm, 0cm, 0cm, 0cm)" draw:luminance="0%" draw:contrast="0%" draw:red="0%" draw:green="0%" draw:blue="0%" draw:gamma="1" draw:color-inversion="false" draw:color-mode="standard"
draw:transparency="0%"/></style:style></office:automatic-styles><office:body><office:forms form:automatic-focus="false" form:apply-design-mode="false"/><text:sequence-decls><text:sequence-decl text:display-outline-level="0" text:name="Illustration"/><text:sequence-decl
text:display-outline-level="0" text:name="Table"/><text:sequence-decl text:display-outline-level="0" text:name="Text"/><text:sequence-decl text:display-outline-level="0" text:name="Drawing"/></text:sequence-decls><text:p text:style-name="P1">I love China.</text:p><text:p
text:style-name="Standard">我<text:span text:style-name="T1">喜欢中国。</text:span></text:p><text:p text:style-name="Standard"/><text:p text:style-name="Standard"><draw:image draw:style-name="fr1" draw:name="图形1" text:anchor-type="paragraph" svg:width="16.934cm"
svg:height="12.701cm" draw:z-index="0" xlink:href="#Pictures/1000000000000280000001E0EE6E838D.jpg" xlink:type="simple" xlink:show="embed" xlink:actuate="onLoad"/></text:p></office:body></office:document-content>

抱歉!评论已关闭.