import io

import docx
from docx import Document
from docx.document import Document as _Document
from docx.oxml.table import CT_Tbl
from docx.oxml.text.paragraph import CT_P
from docx.table import _Cell, Table
from docx.text.paragraph import Paragraph


def iter_block_items(parent):
    """Yield each paragraph and table directly under *parent*, in document order.

    Args:
        parent: A python-docx ``Document`` or a table ``_Cell``.  A cell can
            itself hold paragraphs and tables, so it is accepted as well.

    Yields:
        ``Paragraph`` or ``Table`` objects wrapping the child XML elements.
        Children of any other element type are skipped.

    Raises:
        ValueError: If *parent* is neither a ``Document`` nor a ``_Cell``.
    """
    if isinstance(parent, _Document):
        parent_elm = parent.element.body
    elif isinstance(parent, _Cell):
        parent_elm = parent._tc
    else:
        raise ValueError("something's not right")

    for child in parent_elm.iterchildren():
        if isinstance(child, CT_P):
            yield Paragraph(child, parent)
        elif isinstance(child, CT_Tbl):
            yield Table(child, parent)


def get_df_row_para(num, block):
    """Return the stripped text of a paragraph as a one-element list.

    Args:
        num: Running line counter.  Not used here; kept so the signature
            parallels ``get_df_row_table``.
        block: The ``Paragraph`` to read.

    Returns:
        ``[text]`` where ``text`` is the paragraph text with surrounding
        whitespace (including newlines) removed.
    """
    return [block.text.strip()]


def get_df_row_table(num, block):
    """Flatten a table into one text entry per table row.

    The stripped text of every paragraph in every cell of a row is joined
    with single spaces into one string.

    Args:
        num: Running line counter; incremented once per table row.
        block: The ``Table`` to read.

    Returns:
        A ``(num, rows)`` tuple where ``num`` is the updated counter and
        ``rows`` is a list holding one single-element list ``[row_text]`` per
        table row.
    """
    rows = []
    for row in block.rows:
        cell_texts = []
        previous_tc = None
        for cell in row.cells:
            # ``row.cells`` has one entry per grid column, so a horizontally
            # merged cell shows up once for every column it spans.  Skip the
            # repeats (same underlying <w:tc> element) so its text is not
            # duplicated in the joined row string.
            current_tc = cell._tc
            if current_tc is previous_tc:
                continue
            previous_tc = current_tc
            cell_texts.extend(
                paragraph.text.strip() for paragraph in cell.paragraphs
            )
        rows.append([" ".join(cell_texts)])
        num += 1
    return num, rows


def txt_from_docx(document_, with_api=True):
    """Extract the text of a .docx document block by block, in document order.

    Args:
        document_: The document source.  When ``with_api`` is true it is
            passed straight to ``docx.Document`` (so it should be whatever
            that constructor accepts, e.g. a path or file-like object).  When
            ``with_api`` is false it is treated as the raw bytes of the file
            and wrapped in ``io.BytesIO`` first.
        with_api: Selects how ``document_`` is interpreted (see above).

    Returns:
        A list with one entry per top-level block.  NOTE: the entry shapes
        differ by block type: each paragraph contributes a ``str``, while each
        table row contributes a single-element list ``[row_text]``.  Tables
        nested inside table cells are not descended into.
    """
    # To keep out false positives you can keep a list of rejected headings.
    source = document_ if with_api else io.BytesIO(document_)
    document = Document(source)

    rows = []
    num = 1
    for block in iter_block_items(document):
        if isinstance(block, Paragraph):
            rows.extend(get_df_row_para(num, block))
            num += 1
        elif isinstance(block, Table):
            num, table_rows = get_df_row_table(num, block)
            rows.extend(table_rows)
    return rows