using System;
using System.IO;
using System.Windows.Forms;
using Aspose.Pdf;
using Aspose.Pdf.Text;
using OfficeOpenXml;
namespace PDFTableExtractor
{
public partial class MainForm : Form
{
private string pdfFilePath = string.Empty;
private System.Data.DataTable extractedTable = null;
/// <summary>
/// Creates the form: builds the designer-generated controls, then selects the
/// EPPlus licence context used by the Excel export.
/// </summary>
public MainForm()
{
    InitializeComponent();

    // Set the EPPlus licence context (the community/non-commercial licence
    // may be used when the application is not for commercial purposes).
    OfficeOpenXml.ExcelPackage.LicenseContext = OfficeOpenXml.LicenseContext.NonCommercial;
}
/// <summary>
/// Click handler for the "upload PDF" button. Lets the user pick a PDF file,
/// extracts its tables via <see cref="ExtractTablesFromPDF"/> and enables the
/// Excel export button only when at least one row was extracted.
/// </summary>
/// <param name="sender">Control that raised the event (not used).</param>
/// <param name="e">Event data (not used).</param>
private void btnUploadPDF_Click(object sender, EventArgs e)
{
    using (OpenFileDialog openFileDialog = new OpenFileDialog())
    {
        openFileDialog.Filter = "PDF Files|*.pdf";
        openFileDialog.Title = "选择 PDF 文件";

        // Nothing to do when the dialog is cancelled; the previous state is kept.
        if (openFileDialog.ShowDialog() != DialogResult.OK)
        {
            return;
        }

        pdfFilePath = openFileDialog.FileName;
        try
        {
            // Extract the table data from the selected file.
            extractedTable = ExtractTablesFromPDF(pdfFilePath);

            bool hasRows = extractedTable != null && extractedTable.Rows.Count > 0;
            if (hasRows)
            {
                MessageBox.Show($"成功从 PDF 提取 {extractedTable.Rows.Count} 行数据!", "提示", MessageBoxButtons.OK, MessageBoxIcon.Information);
            }
            else
            {
                MessageBox.Show("未从 PDF 中提取到表格数据。", "提示", MessageBoxButtons.OK, MessageBoxIcon.Warning);
            }

            btnExportExcel.Enabled = hasRows;
        }
        catch (Exception ex)
        {
            // pdfFilePath already points at the new file, so the table extracted
            // from the previous file must not stay exportable: clear it and
            // disable the export button before reporting the failure.
            extractedTable = null;
            btnExportExcel.Enabled = false;
            MessageBox.Show($"提取表格时发生错误:{ex.Message}", "错误", MessageBoxButtons.OK, MessageBoxIcon.Error);
        }
    }
}
/// <summary>
/// Extracts every table found in a PDF file and flattens all of their rows
/// into a single <see cref="System.Data.DataTable"/> named "PDFTable".
/// </summary>
/// <param name="filePath">Path of the PDF document to load.</param>
/// <returns>
/// A table with one row per absorbed table row, in page/table/row order.
/// Columns are unnamed (default names) and there are as many as the widest
/// row encountered; cells a shorter row does not supply are left unset.
/// The table is empty (no rows, no columns) when no table was found.
/// </returns>
private System.Data.DataTable ExtractTablesFromPDF(string filePath)
{
    System.Data.DataTable dataTable = new System.Data.DataTable("PDFTable");

    // Document is disposable; release it as soon as extraction is done.
    using (Document pdfDocument = new Document(filePath))
    {
        foreach (Page page in pdfDocument.Pages)
        {
            // A fresh absorber per page (as in the vendor's extraction sample), so
            // TableList can only ever hold the tables of the page being visited
            // and rows from earlier pages cannot be added a second time.
            TableAbsorber absorber = new TableAbsorber();
            absorber.Visit(page);

            foreach (AbsorbedTable table in absorber.TableList)
            {
                foreach (AbsorbedRow row in table.RowList)
                {
                    // Grow the column set to fit this row. Sizing the columns from
                    // the first row only would make a wider later row throw when
                    // its extra cells are assigned.
                    while (dataTable.Columns.Count < row.CellList.Count)
                    {
                        dataTable.Columns.Add(); // column names can be set here if needed
                    }

                    System.Data.DataRow dataRow = dataTable.NewRow();
                    int colIndex = 0;
                    foreach (AbsorbedCell cell in row.CellList)
                    {
                        // A cell's text is the concatenation of its text fragments.
                        System.Text.StringBuilder cellText = new System.Text.StringBuilder();
                        foreach (TextFragment textFragment in cell.TextFragments)
                        {
                            cellText.Append(textFragment.Text);
                        }

                        dataRow[colIndex++] = cellText.ToString().Trim();
                    }

                    dataTable.Rows.Add(dataRow);
                }
            }
        }
    }

    return dataTable;
}
/// <summary>
/// Click handler for the "export to Excel" button. Asks the user where to save
/// and writes the previously extracted table there via
/// <see cref="SaveTableToExcel"/>; warns instead when there is nothing to export.
/// </summary>
/// <param name="sender">Control that raised the event (not used).</param>
/// <param name="e">Event data (not used).</param>
private void btnExportExcel_Click(object sender, EventArgs e)
{
    bool hasData = extractedTable != null && extractedTable.Rows.Count != 0;
    if (!hasData)
    {
        MessageBox.Show("没有数据可导出。", "提示", MessageBoxButtons.OK, MessageBoxIcon.Warning);
        return;
    }

    // Suggest a timestamped file name, then let the user choose the location.
    saveFileDialog1.FileName = $"PDF_Extracted_Table_{DateTime.Now:yyyyMMddHHmmss}.xlsx";
    if (saveFileDialog1.ShowDialog() != DialogResult.OK)
    {
        return;
    }

    try
    {
        SaveTableToExcel(extractedTable, saveFileDialog1.FileName);
        MessageBox.Show($"数据已成功导出到:{saveFileDialog1.FileName}", "成功", MessageBoxButtons.OK, MessageBoxIcon.Information);
    }
    catch (Exception ex)
    {
        MessageBox.Show($"导出 Excel 时发生错误:{ex.Message}", "错误", MessageBoxButtons.OK, MessageBoxIcon.Error);
    }
}
/// <summary>
/// Writes a <see cref="System.Data.DataTable"/> to a new Excel workbook with a
/// single worksheet: row 1 holds the column names in bold, the data rows follow.
/// </summary>
/// <param name="dataTable">Table whose columns and rows are written.</param>
/// <param name="filePath">Destination path of the .xlsx file.</param>
private void SaveTableToExcel(System.Data.DataTable dataTable, string filePath)
{
    // Build the workbook in a fresh, empty package and only bind it to the target
    // path when saving. Opening the package on an existing file would load that
    // file's content, and adding the worksheet would then fail whenever a previous
    // export (which already contains a sheet of this name) is being overwritten.
    using (ExcelPackage excelPackage = new ExcelPackage())
    {
        ExcelWorksheet worksheet = excelPackage.Workbook.Worksheets.Add("提取的数据");
        int columnCount = dataTable.Columns.Count;

        // Header row.
        for (int col = 0; col < columnCount; col++)
        {
            ExcelRange headerCell = worksheet.Cells[1, col + 1];
            headerCell.Value = dataTable.Columns[col].ColumnName;
            headerCell.Style.Font.Bold = true;
        }

        // Data rows start at worksheet row 2 (EPPlus indices are 1-based).
        for (int row = 0; row < dataTable.Rows.Count; row++)
        {
            for (int col = 0; col < columnCount; col++)
            {
                object value = dataTable.Rows[row][col];

                // DBNull marks a cell that was never filled; leave the worksheet
                // cell empty instead of storing the DBNull object in it.
                worksheet.Cells[row + 2, col + 1].Value = value is DBNull ? null : value;
            }
        }

        // Auto-fit the column widths. Dimension is null when nothing was written
        // (a table without columns), so there is nothing to fit in that case.
        if (worksheet.Dimension != null)
        {
            worksheet.Cells[worksheet.Dimension.Address].AutoFitColumns();
        }

        excelPackage.SaveAs(new FileInfo(filePath));
    }
}
}
}