
PDFファイルよりテキスト抽出 PdfiumViewer C#

PdfiumViewer.Native.x86.v8-xfa (32bitのとき)
PdfiumViewer.Native.x86_64.v8-xfa (64bitのとき)



using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Diagnostics;
using System.Drawing;
using System.Linq;
using System.Reflection.Emit;
using System.Runtime.InteropServices;
using System.Security.Cryptography;
using System.Text;
using System.Threading.Tasks;
using System.Windows.Forms;
using static System.Windows.Forms.VisualStyles.VisualStyleElement;
using Windows.Graphics.Imaging;
using Windows.Media.Ocr;
using System.Drawing.Imaging;
using System.IO;
using static System.Net.Mime.MediaTypeNames;
using Shell32;
using Windows.Storage.Streams;
using Windows.Storage;
using Windows.UI.Xaml.Controls;
using Windows.UI.Xaml.Controls.Primitives;
using static System.Net.WebRequestMethods;
using System.Runtime.InteropServices.ComTypes;
using Windows.Foundation;
using Windows.UI.Xaml.Shapes;
using Point = System.Drawing.Point;
using System.Runtime.InteropServices.WindowsRuntime;
using Image = System.Drawing.Image;
using Path = System.IO.Path;
using File = System.IO.File;
using PdfiumViewer;

namespace WindowsFormsApp1
    // Main window: previews a chosen file, extracts PDF text with PdfiumViewer,
    // renders PDF pages to JPEG files and runs Windows OCR on an image file.
    public partial class Form1 : Form
        // Helper that rescales the form's controls (see clsResize).
        clsResize _form_resize;

        // NOTE(review): not referenced anywhere in the visible part of this file.
        private static Form1 _form1Instance;

        // Separator placed between the chosen file name and the page number (used in Henkan),
        // and the extension appended by getImgPath.
        private const string IMG_PATH_SEP = "_";
        private const string IMG_PATH_EXT = ".jpg";

        // Render DPI and JPEG quality; Henkan overwrites both from textBox3 / textBox4.
        private int IMG_DPI = 300;
        private long JPEG_QUALITY = 100;

        // Path of the file picked in Open(); Henkan loads it as the PDF to convert.
        private string pdf_path = "";

        // Builds the resize helper for this form and subscribes the Load and Resize handlers.
        // NOTE(review): no InitializeComponent() call is visible in this view; the comment below
        // implies one ran before the helper is created — confirm it was not lost.
        public Form1()

            _form_resize = new clsResize(this); // created after control initialization so that every control already exists

            this.Load += new EventHandler(_Load); // raised once after initialization (form load)
            this.Resize += new EventHandler(_Resize); // raised on every form resize


        // Load handler: fills the DPI box (textBox3) and the JPEG quality box (textBox4)
        // with the same values the IMG_DPI / JPEG_QUALITY fields start with.
        // NOTE(review): clsResize._resize() indexes the bounds captured by _get_initial_size(),
        // and no call to _get_initial_size() is visible here — confirm it runs before the first resize.
        private void _Load(object sender, EventArgs e)
            textBox3.Text = "300";
            textBox4.Text = "100";


        // Resize handler subscribed in the constructor.
        // NOTE(review): no body is visible in this view; presumably it forwards to
        // _form_resize._resize() — confirm.
        private void _Resize(object sender, EventArgs e)

        // Lets the user pick a file (PDF by default, JPEG or any file via the filter).
        // When the dialog is confirmed, the file is shown in webView21, its path is written
        // to textBox1 and it is remembered in pdf_path for the conversion command.
        private void Open()
        {
            // The dialog wraps native resources, so dispose it deterministically.
            using (OpenFileDialog ofd = new OpenFileDialog
            {
                FileName = "default.pdf",
                InitialDirectory = System.Environment.CurrentDirectory + @"\",
                Filter = "PDFファイル(*.pdf)|*.pdf|JPEGファイル(*.jpg)|*.jpg|すべてのファイル(*.*)|*.*",
                FilterIndex = 1,
                Title = "開くファイルを選択してください",
                RestoreDirectory = true,
                CheckFileExists = true,
                CheckPathExists = true
            })
            {
                if (ofd.ShowDialog() != DialogResult.OK)
                {
                    return; // cancelled: leave the current selection untouched
                }

                string selectedPath = ofd.FileName;
                webView21.Source = new Uri(selectedPath);
                textBox1.Text = selectedPath;
                pdf_path = selectedPath;
            }
        }


        // Menu "PDF選択" (select PDF).
        // NOTE(review): body not visible in this view; presumably calls Open() — confirm.
        private void PDF選択ToolStripMenuItem_Click(object sender, EventArgs e)

        // Menu "JPEG変換" (convert to JPEG).
        // NOTE(review): body not visible in this view; presumably calls Henkan() — confirm.
        private void JPEG変換ToolStripMenuItem_Click(object sender, EventArgs e)

        // Menu "テキスト抽出" (extract text).
        // NOTE(review): body not visible in this view; presumably calls SelText() — confirm.
        private void テキスト抽出ToolStripMenuItem_Click(object sender, EventArgs e)

        // Menu "テキスト消去" (clear text): empties the result box.
        private void テキスト消去ToolStripMenuItem_Click(object sender, EventArgs e)
            textBox2.Text = "";

        // Menu "テキスト保存" (save text).
        // NOTE(review): body not visible in this view; presumably calls save() — confirm.
        private void テキスト保存ToolStripMenuItem_Click(object sender, EventArgs e)

        // Menu "終了" (exit).
        // NOTE(review): body not visible in this view; presumably closes the form — confirm.
        private void 終了ToolStripMenuItem_Click(object sender, EventArgs e)

        // NOTE(review): body not visible in this view; the command behind button1 cannot be told from here.
        private void button1_Click(object sender, EventArgs e)

        // NOTE(review): body not visible in this view; the command behind button2 cannot be told from here.
        private void button2_Click(object sender, EventArgs e)

        // Empties the result box (same action as the "テキスト消去" menu item).
        private void button3_Click(object sender, EventArgs e)
            textBox2.Text = "";

        // NOTE(review): body not visible in this view; the command behind button4 cannot be told from here.
        private void button4_Click(object sender, EventArgs e)

        // NOTE(review): body not visible in this view; the command behind button5 cannot be told from here.
        private void button5_Click(object sender, EventArgs e)

        // NOTE(review): body not visible in this view; the command behind button6 cannot be told from here.
        private void button6_Click(object sender, EventArgs e)

        // Extracts the text of every page of the PDF named in textBox1 and appends it to
        // textBox2, one "\r\n" after each page.
        private void SelText()
        {
            var filename = textBox1.Text;

            // Loading an empty or missing path would throw out of the UI handler.
            if (string.IsNullOrWhiteSpace(filename) || !File.Exists(filename))
            {
                MessageBox.Show("PDFファイルを選択してください。");
                return;
            }

            // Collect all pages first and touch the TextBox once; re-assigning
            // textBox2.Text per page copies the whole accumulated text every time.
            var extracted = new StringBuilder();
            using (PdfDocument doc = PdfDocument.Load(filename))
            {
                for (var pageNum = 0; pageNum < doc.PageCount; pageNum++)
                {
                    extracted.Append(doc.GetPdfText(pageNum)).Append("\r\n");
                }
            }

            textBox2.Text = textBox2.Text + extracted.ToString();
        }


        private void Henkan()
             SaveFileDialog sfd = new SaveFileDialog();

            sfd.FileName = "新しいファイル.jpg";
            sfd.InitialDirectory = System.Environment.CurrentDirectory + @"\";
            sfd.Filter = "JPEGファイル(*.jpg)|*.jpg|すべてのファイル(*.*)|*.*";
            sfd.FilterIndex = 1;
            sfd.Title = "保存先のファイルを選択してください";
            sfd.RestoreDirectory = true;
            sfd.OverwritePrompt = true;
            sfd.CheckPathExists = true;

                IMG_DPI = Int32.Parse(textBox3.Text);
                JPEG_QUALITY = long.Parse(textBox4.Text);


            if (sfd.ShowDialog() == DialogResult.OK)

                //int resolution = 300;
                //string ReadFileName = textBox1.Text;
                //PdfiumViewer.PdfDocument pdfdoc = PdfiumViewer.PdfDocument.Load(ReadFileName);

                //SizeF size = pdfdoc.PageSizes[0];
                //int w = (int)size.Width * resolution / 72;
                //int h = (int)size.Height * resolution / 72;
                //Image image = pdfdoc.Render(0, w, h, resolution, resolution, false);
                //image.Save(sfd.FileName, System.Drawing.Imaging.ImageFormat.Jpeg);

                string img_dir = Path.GetDirectoryName(sfd.FileName);//(pdf_path);
                string img_fn_head = Path.GetFileNameWithoutExtension(sfd.FileName) + IMG_PATH_SEP;//(pdf_path) + IMG_PATH_SEP;
                string img_path_head = Path.Combine(img_dir, img_fn_head);

                string img_path_001 = this.getImgPath(img_path_head, 1);

                if (File.Exists(img_path_001))
                    DateTime dt_pdf = File.GetLastWriteTime(pdf_path);
                    DateTime dt_img = File.GetLastWriteTime(img_path_001);

                    if (dt_pdf < dt_img)

                using (PdfiumViewer.PdfDocument pdf_doc = PdfiumViewer.PdfDocument.Load(pdf_path))
                    for (int page = 0; page < pdf_doc.PageCount; page++)
                        SizeF pageSize = pdf_doc.PageSizes[page];
                        double ppi = IMG_DPI * 0.75;
                        int width = (int)(pageSize.Width * IMG_DPI / ppi);
                        int height = (int)(pageSize.Height * IMG_DPI / ppi);
                        using (Image img_obj = pdf_doc.Render(page, width, height, IMG_DPI, IMG_DPI, false))
                            string img_path_full = this.getImgPath(img_path_head, page + 1);

                            this.saveJpeg(img_obj, img_path_full, JPEG_QUALITY);


        // Builds an image path from a path prefix and a page number; the number is
        // formatted with "D3" (zero-padded to at least three digits).
        private string getImgPath(string head, int tail_num)
        {
            string paddedNumber = tail_num.ToString("D3");
            return this.getImgPath(head, paddedNumber);
        }

        // Builds an image path as prefix + suffix + IMG_PATH_EXT.
        private string getImgPath(string head, string tail)
        {
            return string.Concat(head, tail, IMG_PATH_EXT);
        }

        // Saves img_obj to path as JPEG.
        // When a JPEG encoder is available the image is written with the requested quality;
        // otherwise it falls back to the default ImageFormat.Jpeg save.
        private void saveJpeg(Image img_obj, string path, long quality)
        {
            // No call to getImageCodecInfo_Jpeg() is visible elsewhere in this file, so the
            // quality branch could never run; look the encoder up on first use.
            if (this.jpegEncoder == null)
            {
                this.getImageCodecInfo_Jpeg();
            }

            if (this.jpegEncoder == null)
            {
                img_obj.Save(path, ImageFormat.Jpeg);
                return;
            }

            // Both parameter objects are IDisposable; release them once the save is done.
            using (EncoderParameters encParams = new EncoderParameters(1))
            using (EncoderParameter encParam = new EncoderParameter(System.Drawing.Imaging.Encoder.Quality, quality))
            {
                encParams.Param[0] = encParam;
                img_obj.Save(path, this.jpegEncoder, encParams);
            }
        }

        // JPEG encoder found by getImageCodecInfo_Jpeg(); null until a lookup succeeds.
        private ImageCodecInfo jpegEncoder = null;

        // Searches the installed image encoders for the one whose FormatID equals the
        // JPEG format GUID and stores it in jpegEncoder (null when none matches).
        private void getImageCodecInfo_Jpeg()
        {
            Guid jpegFormatId = ImageFormat.Jpeg.Guid;

            // The loop this replaces assigned on every match, so the last match wins.
            this.jpegEncoder = ImageCodecInfo.GetImageEncoders()
                .LastOrDefault(codec => codec.FormatID == jpegFormatId);
        }

        // Asks for a target file and writes the contents of textBox2 to it.
        // NOTE(review): the source showed the stream and writer being created but no write,
        // flush or close; writing textBox2.Text is the presumed intent of this text-save
        // command — confirm.
        private void save()
        {
            using (SaveFileDialog sfd = new SaveFileDialog())
            {
                sfd.FileName = "新しいファイル.txt";
                sfd.InitialDirectory = System.Environment.CurrentDirectory + @"\";
                sfd.Filter = "Textファイル(*.txt)|*.txt|すべてのファイル(*.*)|*.*";
                sfd.FilterIndex = 1;
                sfd.Title = "保存先のファイルを選択してください";
                sfd.RestoreDirectory = true;
                sfd.OverwritePrompt = true;
                sfd.CheckPathExists = true;

                if (sfd.ShowDialog() != DialogResult.OK)
                {
                    return;
                }

                // Dispose both the stream and the writer so buffered text is flushed and
                // the file handle is released.
                using (System.IO.Stream stream = sfd.OpenFile())
                {
                    if (stream == null)
                    {
                        return;
                    }

                    using (System.IO.StreamWriter sw = new System.IO.StreamWriter(stream))
                    {
                        sw.Write(textBox2.Text);
                    }
                }
            }
        }

        // Runs OCR on the file named in textBox1 and appends the recognized text to
        // textBox2 on a new line, with every space character removed.
        public async void OcrAll()
        {
            string recognized = await PerformOCR(textBox1.Text);
            string withoutSpaces = recognized.Replace(" ", "");

            textBox2.Text = string.Concat(textBox2.Text, "\r\n", withoutSpaces);
        }

        private async Task<string> PerformOCR(string imagePath)
                byte[] imageBytes = System.IO.File.ReadAllBytes(imagePath);

                IBuffer buffer = imageBytes.AsBuffer();

                SoftwareBitmap softwareBitmap;
                using (var stream = new InMemoryRandomAccessStream())
                    await stream.WriteAsync(buffer);
                    var decoder = await Windows.Graphics.Imaging.BitmapDecoder.CreateAsync(stream);
                    softwareBitmap = await decoder.GetSoftwareBitmapAsync();

                var language = new Windows.Globalization.Language("ja");
                OcrEngine ocrEngine = OcrEngine.TryCreateFromUserProfileLanguages();//OcrEngine.TryCreateFromLanguage(language); //

                OcrResult ocrResult = await ocrEngine.RecognizeAsync(softwareBitmap);

                //string recognizedText = ocrResult.Text;
                string recognizedText = ocrResult.Lines.Select(line => line.Text).Aggregate((current, next) => current + Environment.NewLine + next);
                return recognizedText;
            catch (Exception ex)
                return "OCRエラー" + ex.Message;

        // NOTE(review): body not visible in this view; the command behind button7 cannot be told from here.
        private void button7_Click(object sender, EventArgs e)

        // Menu "ocr処理" (OCR processing).
        // NOTE(review): body not visible in this view; presumably calls OcrAll() — confirm.
        private void ocr処理ToolStripMenuItem_Click(object sender, EventArgs e)
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Windows.Forms;

// Rescales the named controls of a form in proportion to the form's client size.
//
// Usage: construct with the form, call _get_initial_size() once the controls have their
// design-time bounds, then call _resize() from the form's Resize event.
public class clsResize
{
    // Bounds captured by _get_initial_size(), in the enumeration order of _get_all_controls().
    List<System.Drawing.Rectangle> _arr_control_storage = new List<System.Drawing.Rectangle>();

    // Set to true to keep DataGridView row headers and subtract their width from the columns.
    private bool showRowHeader = false;

    // Records the form, its initial client size and the initial font size and bounds of
    // every named control.
    public clsResize(Form _form_)
    {
        if (_form_ == null)
        {
            throw new ArgumentNullException(nameof(_form_));
        }

        form = _form_; // the calling form
        _formSize = _form_.ClientSize; // initial form size, the reference for all ratios
        _fontsize = _form_.Font.Size;

        FontTable = new Dictionary<string, float>();
        ControlTable = new Dictionary<string, System.Drawing.Rectangle>();
        foreach (Control control in _get_all_controls(form))
        {
            // Indexer instead of Add: two controls that share a name must not throw.
            FontTable[control.Name] = control.Font.Size;
            ControlTable[control.Name] = control.Bounds;
        }
    }

    // Initial font size per control name.
    Dictionary<string, float> FontTable;

    // Initial bounds per control name.
    Dictionary<string, System.Drawing.Rectangle> ControlTable;

    private float _fontsize { get; set; }

    private System.Drawing.SizeF _formSize { get; set; }

    private Form form { get; set; }

    // Captures the current bounds of every named control as the baseline for _resize().
    public void _get_initial_size()
    {
        // Start fresh so a repeated call does not append a second copy of the bounds.
        _arr_control_storage.Clear();

        foreach (Control control in _get_all_controls(form))
        {
            _arr_control_storage.Add(control.Bounds);

            if (control.GetType() == typeof(DataGridView))
            {
                _dgv_Column_Adjust(((DataGridView)control), showRowHeader);
            }
        }
    }

    // Scales bounds and font of every named control by the ratio between the form's
    // current and initial client size.
    public void _resize()
    {
        // The client size can be 0x0 (e.g. while minimized). Scaling by zero would collapse
        // every control and ask for a font of size 0, which the Font constructor rejects.
        if (_formSize.Width <= 0 || _formSize.Height <= 0 ||
            form.ClientSize.Width <= 0 || form.ClientSize.Height <= 0)
        {
            return;
        }

        double _form_ratio_width = (double)form.ClientSize.Width / (double)_formSize.Width;
        double _form_ratio_height = (double)form.ClientSize.Height / (double)_formSize.Height;

        int _pos = -1; // index into _arr_control_storage, advanced once per control
        foreach (Control control in _get_all_controls(form))
        {
            _pos += 1;
            if (_pos >= _arr_control_storage.Count)
            {
                // No baseline for this control (baseline not captured yet, or the control
                // was added afterwards); there is nothing to scale from.
                break;
            }

            System.Drawing.Rectangle baseline = _arr_control_storage[_pos];

            System.Drawing.Size _controlSize = new System.Drawing.Size(
                (int)(baseline.Width * _form_ratio_width),
                (int)(baseline.Height * _form_ratio_height));

            System.Drawing.Point _controlposition = new System.Drawing.Point(
                (int)(baseline.X * _form_ratio_width),
                (int)(baseline.Y * _form_ratio_height));

            control.Bounds = new System.Drawing.Rectangle(_controlposition, _controlSize);

            // To show DataGridView row headers, set showRowHeader (declared above) to true.
            if (control.GetType() == typeof(DataGridView))
            {
                _dgv_Column_Adjust(((DataGridView)control), showRowHeader);
            }

            // Scale from the font size recorded for this control at construction time.
            // A control unknown to the table keeps its current font.
            float baseFontSize;
            if (!FontTable.TryGetValue(control.Name, out baseFontSize))
            {
                continue;
            }

            this._fontsize = baseFontSize;

            // The new size is the mean of the width-scaled and height-scaled sizes.
            control.Font = new System.Drawing.Font(form.Font.FontFamily,
                (float)(((Convert.ToDouble(_fontsize) * _form_ratio_width) / 2) +
                        ((Convert.ToDouble(_fontsize) * _form_ratio_height) / 2)));
        }
    }

    // Gives every column of dgv an equal share of the grid width.
    // When _showRowHeader is true the row-header width is subtracted from the available
    // width; otherwise the row headers are hidden.
    private void _dgv_Column_Adjust(DataGridView dgv, bool _showRowHeader)
    {
        int intRowHeader = 0;
        const int Hscrollbarwidth = 5;
        if (_showRowHeader)
        {
            intRowHeader = dgv.RowHeadersWidth;
        }
        else
        {
            dgv.RowHeadersVisible = false;
        }

        for (int i = 0; i < dgv.ColumnCount; i++)
        {
            if (dgv.Dock == DockStyle.Fill) // a docked grid gets the full width
            {
                dgv.Columns[i].Width = ((dgv.Width - intRowHeader) / dgv.ColumnCount);
            }
            else
            {
                dgv.Columns[i].Width = ((dgv.Width - intRowHeader - Hscrollbarwidth) / dgv.ColumnCount);
            }
        }
    }

    // Enumerates all descendants of c that have a non-empty Name: for each level, the
    // descendants of the children first, then the children themselves.
    private static IEnumerable<Control> _get_all_controls(Control c)
    {
        return c.Controls.Cast<Control>()
            .SelectMany(item => _get_all_controls(item))
            .Concat(c.Controls.Cast<Control>())
            .Where(control => control.Name != string.Empty);
    }
}


