In our previous posts we explained how to read Excel files using JavaScript and how to read CSV files using JavaScript. In this post I provide a working example that reads the content of a PDF file in JavaScript. For this example, we will be using PDF.js to extract the PDF content.
Read PDF text using JavaScript
As stated above, we will be using pdf.js to read the PDF file in JavaScript. We will use version 1.10 of pdf.js, which is stable and easy to use. Here are the steps we will take to read the PDF contents:
- First, we will convert PDF file contents into
ArrayBuffer
- The ArrayBuffer is passed to PDF.js, which loads the document using
getDocument()
- Each page's data is retrieved using
getPage()
- Each page text is extracted using
textContent.items
Let's begin by adding the required JavaScript files and creating the HTML needed to browse for a PDF file:
<!-- PDF.js 1.10.100 core library; the page script uses the PDFJS global, presumably defined by this file -->
<script src="https://cdnjs.cloudflare.com/ajax/libs/pdf.js/1.10.100/pdf.min.js" ></script>
<!-- NOTE(review): this worker entry comes from PDF.js 2.6.347, a different version than the 1.10.100 files around it; confirm it is actually needed -->
<script src="https://cdnjs.cloudflare.com/ajax/libs/pdf.js/2.6.347/pdf.worker.entry.min.js" ></script>
<!-- PDF.js 1.10.100 worker script, matching the core library version above -->
<script src="https://cdnjs.cloudflare.com/ajax/libs/pdf.js/1.10.100/pdf.worker.min.js" ></script>
<!-- File picker; choosing a file calls ExtractText() -->
<input type="file" id="file-id" name="file_name" onchange="ExtractText();">
<!-- Container for the output: the extracted page text is appended to this element by id -->
<div id="output"></div>
Once a file has been browsed for and selected, the JavaScript function ExtractText() is called.
Here is the complete JavaScript code that will be used:
// Not referenced by any of the functions shown in this snippet.
var datass = '';
var DataArr = [];
// The PDF.js worker script location is set to an empty string.
// NOTE(review): presumably this makes PDF.js rely on the worker script that the
// page already loads through a script tag - confirm against the PDF.js 1.10 docs.
PDFJS.workerSrc = '';
/**
 * Change handler for the "file-id" file input.
 *
 * Reads the selected file as a base64 data URL and passes that URL to
 * convertDataURIToBinary(). Does nothing when no file is selected (for
 * example when the file dialog is cancelled), and logs the error instead of
 * forwarding a null result when the read fails.
 */
function ExtractText() {
  var input = document.getElementById("file-id");
  var file = input && input.files ? input.files[0] : undefined;
  if (!file) {
    // Nothing selected: readAsDataURL() would throw on a non-Blob argument.
    return;
  }

  var fReader = new FileReader();
  // Handlers are attached before the read starts. "load" fires only on
  // success, whereas "loadend" also fires after a failed read, when the
  // result is null.
  fReader.onload = function (event) {
    convertDataURIToBinary(event.target.result);
  };
  fReader.onerror = function () {
    console.error(fReader.error);
  };
  fReader.readAsDataURL(file);
}
// Separator that precedes the base64 payload inside a data URL.
var BASE64_MARKER = ';base64,';

/**
 * Decode the base64 payload of a data URL into bytes and pass them on to
 * pdfAsArray().
 *
 * @param {string} dataURI - Data URL containing ";base64," followed by the payload.
 */
function convertDataURIToBinary(dataURI) {
  var payloadStart = dataURI.indexOf(BASE64_MARKER) + BASE64_MARKER.length;
  // atob() yields a binary string: one character per decoded byte.
  var decoded = window.atob(dataURI.substring(payloadStart));
  var bytes = Uint8Array.from(decoded, function (ch) {
    return ch.charCodeAt(0);
  });
  pdfAsArray(bytes);
}
/**
 * Retrieve the text of a single page of a loaded PDF document.
 *
 * @param {number} pageNum - Page number passed to getPage() (callers in this file start at 1).
 * @param {Object} PDFDocumentInstance - Document object providing getPage().
 * @returns {Promise<string>} Resolves with every text item's `str` followed by
 *   a single space, concatenated in order. Rejects when getPage() or
 *   getTextContent() fails, so callers such as Promise.all() can settle
 *   instead of waiting forever.
 */
function getPageText(pageNum, PDFDocumentInstance) {
  // Chain the existing promises rather than wrapping them in a new Promise:
  // this way a failure in either step propagates to the caller.
  return Promise.resolve(PDFDocumentInstance.getPage(pageNum))
    .then(function (pdfPage) {
      // getTextContent() is what exposes the text of the page.
      return pdfPage.getTextContent();
    })
    .then(function (textContent) {
      return textContent.items
        .map(function (item) {
          return item.str + " ";
        })
        .join("");
    });
}
/**
 * Escape the characters that are significant in HTML so that text taken from
 * a PDF is displayed literally instead of being parsed as markup.
 */
function escapeHtmlText(text) {
  return String(text)
    .replace(/&/g, "&amp;")
    .replace(/</g, "&lt;")
    .replace(/>/g, "&gt;")
    .replace(/"/g, "&quot;")
    .replace(/'/g, "&#39;");
}

/**
 * Load a PDF from binary data, extract the text of every page and append it
 * to the element with id "output", one "Page N contents" section per page.
 *
 * @param {Uint8Array} pdfAsArray - PDF file contents handed to PDFJS.getDocument().
 * @returns {Promise<void>} Settles once rendering has finished. Loading and
 *   extraction errors are logged with console.error and do not reject.
 */
function pdfAsArray(pdfAsArray) {
  return PDFJS.getDocument(pdfAsArray)
    .then(function (pdf) {
      // One promise per page; page numbers start at 1.
      var pagesPromises = [];
      for (var pageNumber = 1; pageNumber <= pdf.numPages; pageNumber++) {
        pagesPromises.push(getPageText(pageNumber, pdf));
      }
      // Returned so that a failing page reaches the catch handler below.
      return Promise.all(pagesPromises);
    })
    .then(function (pagesText) {
      // e.g ["Text content page 1", "Text content page 2", "Text content page 3" ... ]
      console.log(pagesText);
      var html = pagesText
        .map(function (pageText, index) {
          // The page text comes from the PDF file, so it is escaped before
          // being inserted as HTML.
          return "<br/><br/>Page " + (index + 1) + " contents <br/> <br/>" + escapeHtmlText(pageText);
        })
        .join("");
      // Single DOM update instead of re-parsing the container for every page.
      document.getElementById('output').innerHTML += html;
    })
    .catch(function (reason) {
      // PDF loading or text extraction error
      console.error(reason);
    });
}
This is the sample PDF which we will use to test this example; it has 2 pages, as shown in the image below.
I have explained many parts of the code using comments.
Complete HTML/Javascript will look like this
<html>
<body>
<!-- PDF.js 1.10.100 core library; the page script uses the PDFJS global, presumably defined by this file -->
<script src="https://cdnjs.cloudflare.com/ajax/libs/pdf.js/1.10.100/pdf.min.js"></script>
<!-- NOTE(review): this worker entry comes from PDF.js 2.6.347, a different version than the 1.10.100 files around it; confirm it is actually needed -->
<script src="https://cdnjs.cloudflare.com/ajax/libs/pdf.js/2.6.347/pdf.worker.entry.min.js" ></script>
<!-- PDF.js 1.10.100 worker script, matching the core library version above -->
<script src="https://cdnjs.cloudflare.com/ajax/libs/pdf.js/1.10.100/pdf.worker.min.js"></script>
<!-- File picker; choosing a file calls ExtractText() -->
<input type="file" id="file-id" name="file_name" onchange="ExtractText();">
<!-- Container for the output: the extracted page text is appended to this element by id -->
<div id="output"></div>
<script>
// Not referenced by any of the functions in this page's script.
var datass = '';
var DataArr = [];
// The PDF.js worker script location is set to an empty string.
// NOTE(review): presumably this makes PDF.js rely on the worker script that the
// page already loads through a script tag - confirm against the PDF.js 1.10 docs.
PDFJS.workerSrc = '';
/**
 * Change handler for the "file-id" file input.
 *
 * Reads the selected file as a base64 data URL and passes that URL to
 * convertDataURIToBinary(). Does nothing when no file is selected (for
 * example when the file dialog is cancelled), and logs the error instead of
 * forwarding a null result when the read fails.
 */
function ExtractText() {
  var input = document.getElementById("file-id");
  var file = input && input.files ? input.files[0] : undefined;
  if (!file) {
    // Nothing selected: readAsDataURL() would throw on a non-Blob argument.
    return;
  }

  var fReader = new FileReader();
  // Handlers are attached before the read starts. "load" fires only on
  // success, whereas "loadend" also fires after a failed read, when the
  // result is null.
  fReader.onload = function (event) {
    convertDataURIToBinary(event.target.result);
  };
  fReader.onerror = function () {
    console.error(fReader.error);
  };
  fReader.readAsDataURL(file);
}
// Separator that precedes the base64 payload inside a data URL.
var BASE64_MARKER = ';base64,';

/**
 * Decode the base64 payload of a data URL into bytes and pass them on to
 * pdfAsArray().
 *
 * @param {string} dataURI - Data URL containing ";base64," followed by the payload.
 */
function convertDataURIToBinary(dataURI) {
  var payloadStart = dataURI.indexOf(BASE64_MARKER) + BASE64_MARKER.length;
  // atob() yields a binary string: one character per decoded byte.
  var decoded = window.atob(dataURI.substring(payloadStart));
  var bytes = Uint8Array.from(decoded, function (ch) {
    return ch.charCodeAt(0);
  });
  pdfAsArray(bytes);
}
/**
 * Retrieve the text of a single page of a loaded PDF document.
 *
 * @param {number} pageNum - Page number passed to getPage() (callers in this file start at 1).
 * @param {Object} PDFDocumentInstance - Document object providing getPage().
 * @returns {Promise<string>} Resolves with every text item's `str` followed by
 *   a single space, concatenated in order. Rejects when getPage() or
 *   getTextContent() fails, so callers such as Promise.all() can settle
 *   instead of waiting forever.
 */
function getPageText(pageNum, PDFDocumentInstance) {
  // Chain the existing promises rather than wrapping them in a new Promise:
  // this way a failure in either step propagates to the caller.
  return Promise.resolve(PDFDocumentInstance.getPage(pageNum))
    .then(function (pdfPage) {
      // getTextContent() is what exposes the text of the page.
      return pdfPage.getTextContent();
    })
    .then(function (textContent) {
      return textContent.items
        .map(function (item) {
          return item.str + " ";
        })
        .join("");
    });
}
/**
 * Escape the characters that are significant in HTML so that text taken from
 * a PDF is displayed literally instead of being parsed as markup.
 */
function escapeHtmlText(text) {
  return String(text)
    .replace(/&/g, "&amp;")
    .replace(/</g, "&lt;")
    .replace(/>/g, "&gt;")
    .replace(/"/g, "&quot;")
    .replace(/'/g, "&#39;");
}

/**
 * Load a PDF from binary data, extract the text of every page and append it
 * to the element with id "output", one "Page N contents" section per page.
 *
 * @param {Uint8Array} pdfAsArray - PDF file contents handed to PDFJS.getDocument().
 * @returns {Promise<void>} Settles once rendering has finished. Loading and
 *   extraction errors are logged with console.error and do not reject.
 */
function pdfAsArray(pdfAsArray) {
  return PDFJS.getDocument(pdfAsArray)
    .then(function (pdf) {
      // One promise per page; page numbers start at 1.
      var pagesPromises = [];
      for (var pageNumber = 1; pageNumber <= pdf.numPages; pageNumber++) {
        pagesPromises.push(getPageText(pageNumber, pdf));
      }
      // Returned so that a failing page reaches the catch handler below.
      return Promise.all(pagesPromises);
    })
    .then(function (pagesText) {
      // e.g ["Text content page 1", "Text content page 2", "Text content page 3" ... ]
      console.log(pagesText);
      var html = pagesText
        .map(function (pageText, index) {
          // The page text comes from the PDF file, so it is escaped before
          // being inserted as HTML.
          return "<br/><br/>Page " + (index + 1) + " contents <br/> <br/>" + escapeHtmlText(pageText);
        })
        .join("");
      // Single DOM update instead of re-parsing the container for every page.
      document.getElementById('output').innerHTML += html;
    })
    .catch(function (reason) {
      // PDF loading or text extraction error
      console.error(reason);
    });
}
</script>
</body>
</html>
Once we are done, we can open the above code in a browser, and we will see the output shown below.
Complete Fiddle sample
As you can see from the example output above, we were able to extract the PDF contents using JavaScript and show all of the text.
You may also like to read:
Convert Image to base64 string using Javascript
Solving Error "JsonException: A possible object cycle was detected" .NET Core
vikas_jk
Hello, Thanks for your query.
This is the part of the code that fetches the text:
Modify this line of code, according to your requirements.
Thanks.