In one of our previous post we have explained about how to read excel using Javascript and read csv using Javascript, but in this post I have provided working example to read content of pdf file in Javascript. For this example, we will be using PDF.js to extract pdf content.

Read PDF text using JavaScript

As stated above, we will be using pdf.js for reading pdf file using Javascript, for this we will be using pdf.js 1.10 version, which is much easier to use and stable, here are the steps which we will be taking to read pdf contents.

  • First, we will convert PDF file contents into ArrayBuffer
  • ArrayBuffer is passed to PDF.js, and read text using getDocument()
  • Each page is data is extracted using getPage()
  • Each page text is extracted using textContent.items

Let's begin by adding require Javscript file and creating required HTMl to browse PDF file

    <script src="https://cdnjs.cloudflare.com/ajax/libs/pdf.js/1.10.100/pdf.min.js" ></script>
    <script src="https://cdnjs.cloudflare.com/ajax/libs/pdf.js/2.6.347/pdf.worker.entry.min.js" ></script>
    <script src="https://cdnjs.cloudflare.com/ajax/libs/pdf.js/1.10.100/pdf.worker.min.js" ></script>
    <input type="file" id="file-id" name="file_name" onchange="ExtractText();">


    <!-- a container for the output -->
    <div id="output"></div>
    

Once the file is browsed and selected we are calling JS function ExtractText()

Here is the complete Javascript, code which will be used

        var datass = '';
        var DataArr = [];
        PDFJS.workerSrc = '';

        function ExtractText() {
            var input = document.getElementById("file-id");
            var fReader = new FileReader();
            fReader.readAsDataURL(input.files[0]);
            // console.log(input.files[0]);
            fReader.onloadend = function (event) {
                convertDataURIToBinary(event.target.result);
            }
        }

        var BASE64_MARKER = ';base64,';

        function convertDataURIToBinary(dataURI) {

            var base64Index = dataURI.indexOf(BASE64_MARKER) + BASE64_MARKER.length;
            var base64 = dataURI.substring(base64Index);
            var raw = window.atob(base64);
            var rawLength = raw.length;
            var array = new Uint8Array(new ArrayBuffer(rawLength));

            for (var i = 0; i < rawLength; i++) {
                array[i] = raw.charCodeAt(i);
            }
            pdfAsArray(array)

        }

        function getPageText(pageNum, PDFDocumentInstance) {
            // Return a Promise that is solved once the text of the page is retrieven
            return new Promise(function (resolve, reject) {
                PDFDocumentInstance.getPage(pageNum).then(function (pdfPage) {
                    // The main trick to obtain the text of the PDF page, use the getTextContent method
                    pdfPage.getTextContent().then(function (textContent) {
                        var textItems = textContent.items;
                        var finalString = "";

                        // Concatenate the string of the item to the final string
                        for (var i = 0; i < textItems.length; i++) {
                            var item = textItems[i];

                            finalString += item.str + " ";
                        }

                        // Solve promise with the text retrieven from the page
                        resolve(finalString);
                    });
                });
            });
        }

        function pdfAsArray(pdfAsArray) {

            PDFJS.getDocument(pdfAsArray).then(function (pdf) {

                var pdfDocument = pdf;
                // Create an array that will contain our promises
                var pagesPromises = [];

                for (var i = 0; i < pdf.pdfInfo.numPages; i++) {
                    // Required to prevent that i is always the total of pages
                    (function (pageNumber) {
                        // Store the promise of getPageText that returns the text of a page
                        pagesPromises.push(getPageText(pageNumber, pdfDocument));
                    })(i + 1);
                }

                // Execute all the promises
                Promise.all(pagesPromises).then(function (pagesText) {

                    // Display text of all the pages in the console
                    // e.g ["Text content page 1", "Text content page 2", "Text content page 3" ... ]
                    console.log(pagesText); // representing every single page of PDF Document by array indexing
                    console.log(pagesText.length);
                    var outputStr = "";
                    for (var pageNum = 0; pageNum < pagesText.length; pageNum++) {
                        console.log(pagesText[pageNum]);
                        outputStr = "";
                        outputStr = "<br/><br/>Page " + (pageNum + 1) + " contents <br/> <br/>";

                        var div = document.getElementById('output');

                        div.innerHTML += (outputStr + pagesText[pageNum]);

                    }




                });

            }, function (reason) {
                // PDF loading error
                console.error(reason);
            });
        }

This is our Sample PDF which will use to test this example, it has 2 pages as shown in the below image

sample-pdf-file-page-contents-extract-using-javascript-min.png

I have explained many part of the code using comments.

Complete HTML/Javascript will look like this

<html>
<body>

    <script src="https://cdnjs.cloudflare.com/ajax/libs/pdf.js/1.10.100/pdf.min.js"></script>
    <script src="https://cdnjs.cloudflare.com/ajax/libs/pdf.js/2.6.347/pdf.worker.entry.min.js" ></script>
    <script src="https://cdnjs.cloudflare.com/ajax/libs/pdf.js/1.10.100/pdf.worker.min.js"></script>
    <input type="file" id="file-id" name="file_name" onchange="ExtractText();">


    <!-- a container for the output -->
    <div id="output"></div>

    <script>
        var datass = '';
        var DataArr = [];
        PDFJS.workerSrc = '';

        function ExtractText() {
            var input = document.getElementById("file-id");
            var fReader = new FileReader();
            fReader.readAsDataURL(input.files[0]);
            // console.log(input.files[0]);
            fReader.onloadend = function (event) {
                convertDataURIToBinary(event.target.result);
            }
        }

        var BASE64_MARKER = ';base64,';

        function convertDataURIToBinary(dataURI) {

            var base64Index = dataURI.indexOf(BASE64_MARKER) + BASE64_MARKER.length;
            var base64 = dataURI.substring(base64Index);
            var raw = window.atob(base64);
            var rawLength = raw.length;
            var array = new Uint8Array(new ArrayBuffer(rawLength));

            for (var i = 0; i < rawLength; i++) {
                array[i] = raw.charCodeAt(i);
            }
            pdfAsArray(array)

        }

        function getPageText(pageNum, PDFDocumentInstance) {
            // Return a Promise that is solved once the text of the page is retrieven
            return new Promise(function (resolve, reject) {
                PDFDocumentInstance.getPage(pageNum).then(function (pdfPage) {
                    // The main trick to obtain the text of the PDF page, use the getTextContent method
                    pdfPage.getTextContent().then(function (textContent) {
                        var textItems = textContent.items;
                        var finalString = "";

                        // Concatenate the string of the item to the final string
                        for (var i = 0; i < textItems.length; i++) {
                            var item = textItems[i];

                            finalString += item.str + " ";
                        }

                        // Solve promise with the text retrieven from the page
                        resolve(finalString);
                    });
                });
            });
        }

        function pdfAsArray(pdfAsArray) {

            PDFJS.getDocument(pdfAsArray).then(function (pdf) {

                var pdfDocument = pdf;
                // Create an array that will contain our promises
                var pagesPromises = [];

                for (var i = 0; i < pdf.pdfInfo.numPages; i++) {
                    // Required to prevent that i is always the total of pages
                    (function (pageNumber) {
                        // Store the promise of getPageText that returns the text of a page
                        pagesPromises.push(getPageText(pageNumber, pdfDocument));
                    })(i + 1);
                }

                // Execute all the promises
                Promise.all(pagesPromises).then(function (pagesText) {

                    // Display text of all the pages in the console
                    // e.g ["Text content page 1", "Text content page 2", "Text content page 3" ... ]
                    console.log(pagesText); // representing every single page of PDF Document by array indexing
                    console.log(pagesText.length);
                    var outputStr = "";
                    for (var pageNum = 0; pageNum < pagesText.length; pageNum++) {
                        console.log(pagesText[pageNum]);
                        outputStr = "";
                        outputStr = "<br/><br/>Page " + (pageNum + 1) + " contents <br/> <br/>";

                        var div = document.getElementById('output');

                        div.innerHTML += (outputStr + pagesText[pageNum]);

                    }




                });

            }, function (reason) {
                // PDF loading error
                console.error(reason);
            });
        }

    </script>
</body>

</html>

Once we are done, we can use the above code in our browser, and you will see output as below

extract-pdf-text-using-javascript-min.gif

Complete Fiddle sample

You may also like to read:

Read PDF in C# using iTextSharp

Where can I download Visual Studio ISO?