-
Notifications
You must be signed in to change notification settings - Fork 444
Description
Hi,
I am using tabula-java to parse a table in a PDF file, but it is skipping every other row: only alternate rows are fetched properly.
I have attached my pdf file named murree_ren.pdf.
murree_ren.pdf
This is the code I have used:
/**
 * Loads the PDF at {@code D:/Pdfs/murree_ren.pdf}, runs
 * {@code SpreadsheetExtractionAlgorithm} over every page and prints each
 * detected table to standard output, one line per table row with the cell
 * texts joined by {@code |}.
 *
 * <p>Carriage returns inside a cell's text are replaced by a single space so
 * that every table row stays on one output line. Any exception raised while
 * loading or extracting is caught and reported on standard output; nothing is
 * rethrown to the caller.
 */
public void parse() {
    System.out.println("TabulaPdfParser.parse-----------------------------------");
    // try-with-resources guarantees the stream and the document are closed
    // even when extraction fails part-way through.
    try (FileInputStream inputStream = new FileInputStream(new File("D:/Pdfs/murree_ren.pdf"));
            PDDocument document = PDDocument.load(inputStream)) {
        System.out.println("TabulaPdfParser.parse--------------------document loaded---------------");
        SpreadsheetExtractionAlgorithm sea = new SpreadsheetExtractionAlgorithm();
        // The extractor is not closed here; the document it reads from is
        // closed by the enclosing try-with-resources.
        @SuppressWarnings("resource")
        PageIterator pi = new ObjectExtractor(document).extract();
        while (pi.hasNext()) {
            // iterate over the pages of the document
            Page page = pi.next();
            List<Table> tables = sea.extract(page);
            System.out.println("TabulaPdfParser.parse------------------||||-----------------table size=" + tables.size());
            // iterate over the tables of the page
            for (Table table : tables) {
                int rowCount = table.getRowCount();
                int colCount = table.getColCount();
                System.out.println("TabulaPdfParser.parse------------getRowCount=" + rowCount + " colcount=" + colCount);
                for (int i = 0; i < rowCount; i++) {
                    StringBuilder line = new StringBuilder();
                    for (int j = 0; j < colCount; j++) {
                        // Separator goes between cells only, never after the last one.
                        if (j > 0) {
                            line.append('|');
                        }
                        line.append(table.getCell(i, j).getText().replace("\r", " "));
                    }
                    System.out.println("RowText:----------row no=" + i + " str=" + line);
                }
            }
        }
    } catch (Exception ex) {
        // Print the exception itself (type + message): getMessage() alone can
        // be null and hides which kind of failure occurred.
        System.out.println("Exception:---------------------------------------=" + ex);
    }
}
Here is the output:
TabulaPdfParser.parse-----------------------------------
TabulaPdfParser.parse--------------------document loaded---------------
TabulaPdfParser.parse------------------||||-----------------table size=109
TabulaPdfParser.parse------------getRowCount=1 colcount=1
RowText:----------row no=0 str=
TabulaPdfParser.parse------------getRowCount=1 colcount=1
RowText:----------row no=0 str=
TabulaPdfParser.parse------------getRowCount=1 colcount=1
RowText:----------row no=0 str=
TabulaPdfParser.parse------------getRowCount=1 colcount=1
RowText:----------row no=0 str=
TabulaPdfParser.parse------------getRowCount=1 colcount=2
RowText:----------row no=0 str=|
TabulaPdfParser.parse------------getRowCount=1 colcount=2
RowText:----------row no=0 str=|
TabulaPdfParser.parse------------getRowCount=1 colcount=1
RowText:----------row no=0 str=
TabulaPdfParser.parse------------getRowCount=1 colcount=1
RowText:----------row no=0 str=
TabulaPdfParser.parse------------getRowCount=1 colcount=1
RowText:----------row no=0 str=
TabulaPdfParser.parse------------getRowCount=1 colcount=1
RowText:----------row no=0 str=
TabulaPdfParser.parse------------getRowCount=1 colcount=1
RowText:----------row no=0 str=
TabulaPdfParser.parse------------getRowCount=2 colcount=7
RowText:----------row no=0 str=1|39167 24/11/2019|MUHAMMAD MATLOOB|MUHAMMAD TAJ|VILL DAHLA TEH MURREE P.O KUNDAN, RAWALPINDI, PUNJAB|6/6/1976|F.A
RowText:----------row no=1 str=||||||
TabulaPdfParser.parse------------getRowCount=2 colcount=2
RowText:----------row no=0 str=VILL DAHLA TEH MURREE P.O KUNDAN, RAWALPINDI, PUNJAB|
RowText:----------row no=1 str=|
TabulaPdfParser.parse------------getRowCount=2 colcount=1
RowText:----------row no=0 str=VILL DAHLA TEH MURREE P.O KUNDAN, RAWALPINDI, PUNJAB
RowText:----------row no=1 str=
TabulaPdfParser.parse------------getRowCount=1 colcount=1
RowText:----------row no=0 str=
TabulaPdfParser.parse------------getRowCount=1 colcount=3
RowText:----------row no=0 str=||
TabulaPdfParser.parse------------getRowCount=1 colcount=2
RowText:----------row no=0 str=|
TabulaPdfParser.parse------------getRowCount=1 colcount=1
RowText:----------row no=0 str=
TabulaPdfParser.parse------------getRowCount=1 colcount=1
RowText:----------row no=0 str=
TabulaPdfParser.parse------------getRowCount=1 colcount=1
RowText:----------row no=0 str=
TabulaPdfParser.parse------------getRowCount=2 colcount=7
RowText:----------row no=0 str=3|455 18/06/2020|WAHEED ANWAR|ABDUL QADOUS|P.O. AUSIA TEH MURREE, RAWALPINDI, PUNJAB|12/9/1954|MATRIC
RowText:----------row no=1 str=||||||
TabulaPdfParser.parse------------getRowCount=2 colcount=2
RowText:----------row no=0 str=WAHEED ANWAR|
RowText:----------row no=1 str=|
TabulaPdfParser.parse------------getRowCount=2 colcount=1
RowText:----------row no=0 str=ABDUL QADOUS
RowText:----------row no=1 str=
TabulaPdfParser.parse------------getRowCount=2 colcount=1
RowText:----------row no=0 str=P.O. AUSIA TEH MURREE, RAWALPINDI, PUNJAB
RowText:----------row no=1 str=
TabulaPdfParser.parse------------getRowCount=2 colcount=1
RowText:----------row no=0 str=P.O. AUSIA TEH MURREE, RAWALPINDI, PUNJAB
RowText:----------row no=1 str=
TabulaPdfParser.parse------------getRowCount=2 colcount=1
RowText:----------row no=0 str=P.O. AUSIA TEH MURREE, RAWALPINDI, PUNJAB
RowText:----------row no=1 str=
TabulaPdfParser.parse------------getRowCount=1 colcount=2
RowText:----------row no=0 str=|
TabulaPdfParser.parse------------getRowCount=1 colcount=2
RowText:----------row no=0 str=|
TabulaPdfParser.parse------------getRowCount=1 colcount=5
RowText:----------row no=0 str=||||
TabulaPdfParser.parse------------getRowCount=1 colcount=2
RowText:----------row no=0 str=|
TabulaPdfParser.parse------------getRowCount=1 colcount=1
RowText:----------row no=0 str=
TabulaPdfParser.parse------------getRowCount=1 colcount=1
RowText:----------row no=0 str=
TabulaPdfParser.parse------------getRowCount=1 colcount=1
RowText:----------row no=0 str=
TabulaPdfParser.parse------------getRowCount=1 colcount=1
RowText:----------row no=0 str=
TabulaPdfParser.parse------------getRowCount=1 colcount=1
RowText:----------row no=0 str=
TabulaPdfParser.parse------------getRowCount=3 colcount=11
RowText:----------row no=0 str=5|61134 2020-12-08|AZRA PARVEEN|MUHAMMAD TALIB|V P O AUSIA TEH MURREE DISTT RAWALPINDI, RAWALPINDI, PUNJAB|30/10/1966|MATRIC||||
RowText:----------row no=1 str=||||||||||
RowText:----------row no=2 str=||||||||||
TabulaPdfParser.parse------------getRowCount=2 colcount=1
RowText:----------row no=0 str=61134 2020-12-08
RowText:----------row no=1 str=
TabulaPdfParser.parse------------getRowCount=2 colcount=1
RowText:----------row no=0 str=61134 2020-12-08
RowText:----------row no=1 str=
TabulaPdfParser.parse------------getRowCount=2 colcount=2
RowText:----------row no=0 str=AZRA PARVEEN|
RowText:----------row no=1 str=|
TabulaPdfParser.parse------------getRowCount=2 colcount=3
RowText:----------row no=0 str=V P O AUSIA TEH MURREE DISTT RAWALPINDI, RAWALPINDI, PUNJAB||
RowText:----------row no=1 str=||
TabulaPdfParser.parse------------getRowCount=2 colcount=3
RowText:----------row no=0 str=V P O AUSIA TEH MURREE DISTT RAWALPINDI, RAWALPINDI, PUNJAB||
RowText:----------row no=1 str=||
TabulaPdfParser.parse------------getRowCount=2 colcount=1
RowText:----------row no=0 str=V P O AUSIA TEH MURREE DISTT RAWALPINDI, RAWALPINDI, PUNJAB
RowText:----------row no=1 str=
TabulaPdfParser.parse------------getRowCount=1 colcount=1
RowText:----------row no=0 str=
TabulaPdfParser.parse------------getRowCount=1 colcount=3
RowText:----------row no=0 str=||
TabulaPdfParser.parse------------getRowCount=1 colcount=3
RowText:----------row no=0 str=||
TabulaPdfParser.parse------------getRowCount=1 colcount=1
RowText:----------row no=0 str=
TabulaPdfParser.parse------------getRowCount=1 colcount=3
RowText:----------row no=0 str=||
TabulaPdfParser.parse------------getRowCount=1 colcount=1
RowText:----------row no=0 str=
TabulaPdfParser.parse------------getRowCount=1 colcount=1
RowText:----------row no=0 str=
TabulaPdfParser.parse------------getRowCount=1 colcount=1
RowText:----------row no=0 str=
TabulaPdfParser.parse------------getRowCount=3 colcount=8
RowText:----------row no=0 str=7|60305 25/01/2021|AZRA NAHEED|MANZOOR HUSSAIN|BAN P.O KHAS TEH MURREE DISTT RAWALPINDI, RAWALPINDI, PUNJAB|17/03/1973|MATRIC|
RowText:----------row no=1 str=|||||||
RowText:----------row no=2 str=|||||||
TabulaPdfParser.parse------------getRowCount=2 colcount=1
RowText:----------row no=0 str=MANZOOR HUSSAIN
RowText:----------row no=1 str=
TabulaPdfParser.parse------------getRowCount=2 colcount=1
RowText:----------row no=0 str=MANZOOR HUSSAIN
RowText:----------row no=1 str=
TabulaPdfParser.parse------------getRowCount=2 colcount=1
RowText:----------row no=0 str=BAN P.O KHAS TEH MURREE DISTT RAWALPINDI, RAWALPINDI, PUNJAB
RowText:----------row no=1 str=
TabulaPdfParser.parse------------getRowCount=2 colcount=2
RowText:----------row no=0 str=BAN P.O KHAS TEH MURREE DISTT RAWALPINDI, RAWALPINDI, PUNJAB|
RowText:----------row no=1 str=|
TabulaPdfParser.parse------------getRowCount=2 colcount=2
RowText:----------row no=0 str=BAN P.O KHAS TEH MURREE DISTT RAWALPINDI, RAWALPINDI, PUNJAB|
RowText:----------row no=1 str=|
TabulaPdfParser.parse------------getRowCount=2 colcount=1
RowText:----------row no=0 str=BAN P.O KHAS TEH MURREE DISTT RAWALPINDI, RAWALPINDI, PUNJAB
RowText:----------row no=1 str=
TabulaPdfParser.parse------------getRowCount=1 colcount=2
RowText:----------row no=0 str=|
TabulaPdfParser.parse------------getRowCount=1 colcount=2
RowText:----------row no=0 str=|
TabulaPdfParser.parse------------getRowCount=1 colcount=2
RowText:----------row no=0 str=|
TabulaPdfParser.parse------------getRowCount=1 colcount=1
RowText:----------row no=0 str=
TabulaPdfParser.parse------------getRowCount=1 colcount=2
RowText:----------row no=0 str=|
TabulaPdfParser.parse------------getRowCount=1 colcount=1
RowText:----------row no=0 str=
TabulaPdfParser.parse------------getRowCount=1 colcount=2
RowText:----------row no=0 str=|
TabulaPdfParser.parse------------getRowCount=3 colcount=9
RowText:----------row no=0 str=9|59463 31/07/2021|MUNAWAR HUSSAIN|MUHAMMAD ABDULLAH|H-705/1 MOH IMAM BARGAH MURREE DISTT RAWALPINDI, RAWALPINDI, PUNJAB|01/03/1974|MATRIC||
RowText:----------row no=1 str=||||||||
RowText:----------row no=2 str=||||||||
TabulaPdfParser.parse------------getRowCount=2 colcount=1
RowText:----------row no=0 str=MUNAWAR HUSSAIN
RowText:----------row no=1 str=
TabulaPdfParser.parse------------getRowCount=2 colcount=1
RowText:----------row no=0 str=MUHAMMAD ABDULLAH
RowText:----------row no=1 str=
TabulaPdfParser.parse------------getRowCount=2 colcount=1
RowText:----------row no=0 str=H-705/1 MOH IMAM BARGAH MURREE DISTT RAWALPINDI, RAWALPINDI, PUNJAB
RowText:----------row no=1 str=
TabulaPdfParser.parse------------getRowCount=2 colcount=1
RowText:----------row no=0 str=H-705/1 MOH IMAM BARGAH MURREE DISTT RAWALPINDI, RAWALPINDI, PUNJAB
RowText:----------row no=1 str=
TabulaPdfParser.parse------------getRowCount=2 colcount=3
RowText:----------row no=0 str=H-705/1 MOH IMAM BARGAH MURREE DISTT RAWALPINDI, RAWALPINDI, PUNJAB||
RowText:----------row no=1 str=||
TabulaPdfParser.parse------------getRowCount=2 colcount=1
RowText:----------row no=0 str=H-705/1 MOH IMAM BARGAH MURREE DISTT RAWALPINDI, RAWALPINDI, PUNJAB
RowText:----------row no=1 str=
TabulaPdfParser.parse------------getRowCount=2 colcount=4
RowText:----------row no=0 str=H-705/1 MOH IMAM BARGAH MURREE DISTT RAWALPINDI, RAWALPINDI, PUNJAB|||
RowText:----------row no=1 str=|||
TabulaPdfParser.parse------------getRowCount=1 colcount=1
RowText:----------row no=0 str=
TabulaPdfParser.parse------------getRowCount=1 colcount=1
RowText:----------row no=0 str=
TabulaPdfParser.parse------------getRowCount=1 colcount=2
RowText:----------row no=0 str=|
TabulaPdfParser.parse------------getRowCount=1 colcount=1
RowText:----------row no=0 str=
TabulaPdfParser.parse------------getRowCount=1 colcount=1
RowText:----------row no=0 str=
TabulaPdfParser.parse------------getRowCount=3 colcount=11
RowText:----------row no=0 str=11|58306 10/01/2022|MUBASHAR ISHAQ QAMAR|MUHAMMAD ISHAQ|VILLAGE DARYA GALI P.O KHAS DARYA GALI TEHSIL MURREE DISTT., RAWALPINDI, PUNJAB|26/02/1992|FSC||||
RowText:----------row no=1 str=||||||||||
RowText:----------row no=2 str=||||||||||
TabulaPdfParser.parse------------getRowCount=2 colcount=1
RowText:----------row no=0 str=MUBASHAR ISHAQ QAMAR
RowText:----------row no=1 str=
TabulaPdfParser.parse------------getRowCount=2 colcount=2
RowText:----------row no=0 str=VILLAGE DARYA GALI P.O KHAS DARYA GALI TEHSIL MURREE DISTT., RAWALPINDI, PUNJAB|
RowText:----------row no=1 str=|
TabulaPdfParser.parse------------getRowCount=2 colcount=2
RowText:----------row no=0 str=VILLAGE DARYA GALI P.O KHAS DARYA GALI TEHSIL MURREE DISTT., RAWALPINDI, PUNJAB|
RowText:----------row no=1 str=|
TabulaPdfParser.parse------------getRowCount=2 colcount=1
RowText:----------row no=0 str=VILLAGE DARYA GALI P.O KHAS DARYA GALI TEHSIL MURREE DISTT., RAWALPINDI, PUNJAB
RowText:----------row no=1 str=
TabulaPdfParser.parse------------getRowCount=2 colcount=1
RowText:----------row no=0 str=VILLAGE DARYA GALI P.O KHAS DARYA GALI TEHSIL MURREE DISTT., RAWALPINDI, PUNJAB
RowText:----------row no=1 str=
TabulaPdfParser.parse------------getRowCount=2 colcount=4
RowText:----------row no=0 str=VILLAGE DARYA GALI P.O KHAS DARYA GALI TEHSIL MURREE DISTT., RAWALPINDI, PUNJAB|||
RowText:----------row no=1 str=|||
TabulaPdfParser.parse------------getRowCount=2 colcount=1
RowText:----------row no=0 str=FSC
RowText:----------row no=1 str=
TabulaPdfParser.parse------------getRowCount=1 colcount=1
RowText:----------row no=0 str=
TabulaPdfParser.parse------------getRowCount=1 colcount=1
RowText:----------row no=0 str=
TabulaPdfParser.parse------------getRowCount=1 colcount=1
RowText:----------row no=0 str=
TabulaPdfParser.parse------------getRowCount=1 colcount=1
RowText:----------row no=0 str=
TabulaPdfParser.parse------------getRowCount=1 colcount=1
RowText:----------row no=0 str=
TabulaPdfParser.parse------------getRowCount=3 colcount=12
RowText:----------row no=0 str=13|49597 19/06/2022|TAHIR MEHBOOB|MUHAMMAD MEHBOOB|GHORA GALI P.O KHAS TEH MURREE , RAWALPINDI, PUNJAB|15/7/1988|ICS , FSC HOMEO|||||
RowText:----------row no=1 str=|||||||||||
RowText:----------row no=2 str=|||||||||||
TabulaPdfParser.parse------------getRowCount=2 colcount=2
RowText:----------row no=0 str=TAHIR MEHBOOB|
RowText:----------row no=1 str=|
TabulaPdfParser.parse------------getRowCount=2 colcount=3
RowText:----------row no=0 str=TAHIR MEHBOOB||
RowText:----------row no=1 str=||
TabulaPdfParser.parse------------getRowCount=2 colcount=1
RowText:----------row no=0 str=TAHIR MEHBOOB
RowText:----------row no=1 str=
TabulaPdfParser.parse------------getRowCount=2 colcount=4
RowText:----------row no=0 str=MUHAMMAD MEHBOOB|||
RowText:----------row no=1 str=|||
TabulaPdfParser.parse------------getRowCount=2 colcount=1
RowText:----------row no=0 str=GHORA GALI P.O KHAS TEH MURREE , RAWALPINDI, PUNJAB
RowText:----------row no=1 str=
TabulaPdfParser.parse------------getRowCount=2 colcount=1
RowText:----------row no=0 str=GHORA GALI P.O KHAS TEH MURREE , RAWALPINDI, PUNJAB
RowText:----------row no=1 str=
TabulaPdfParser.parse------------getRowCount=2 colcount=3
RowText:----------row no=0 str=GHORA GALI P.O KHAS TEH MURREE , RAWALPINDI, PUNJAB||
RowText:----------row no=1 str=||
TabulaPdfParser.parse------------getRowCount=2 colcount=1
RowText:----------row no=0 str=ICS , FSC HOMEO
RowText:----------row no=1 str=
TabulaPdfParser.parse------------getRowCount=1 colcount=1
RowText:----------row no=0 str=
TabulaPdfParser.parse------------getRowCount=1 colcount=3
RowText:----------row no=0 str=||
TabulaPdfParser.parse------------getRowCount=1 colcount=1
RowText:----------row no=0 str=
TabulaPdfParser.parse------------getRowCount=1 colcount=2
RowText:----------row no=0 str=|
TabulaPdfParser.parse------------getRowCount=3 colcount=11
RowText:----------row no=0 str=15|53404 19/03/2023|MUHAMMAD AJMAL MALIK|MAHMOOD AHMED MALIK|VILLAGE P.O AUSUA TEH MUREE DISTT, RAWALPINDI, PUNJAB|26/2/1961|MATRIC||||
RowText:----------row no=1 str=||||||||||
RowText:----------row no=2 str=||||||||||
TabulaPdfParser.parse------------getRowCount=2 colcount=3
RowText:----------row no=0 str=MUHAMMAD AJMAL MALIK||
RowText:----------row no=1 str=||
TabulaPdfParser.parse------------getRowCount=2 colcount=1
RowText:----------row no=0 str=MAHMOOD AHMED MALIK
RowText:----------row no=1 str=
TabulaPdfParser.parse------------getRowCount=2 colcount=3
RowText:----------row no=0 str=VILLAGE P.O AUSUA TEH MUREE DISTT, RAWALPINDI, PUNJAB||
RowText:----------row no=1 str=||
TabulaPdfParser.parse------------getRowCount=2 colcount=1
RowText:----------row no=0 str=VILLAGE P.O AUSUA TEH MUREE DISTT, RAWALPINDI, PUNJAB
RowText:----------row no=1 str=
TabulaPdfParser.parse------------getRowCount=2 colcount=3
RowText:----------row no=0 str=VILLAGE P.O AUSUA TEH MUREE DISTT, RAWALPINDI, PUNJAB||
RowText:----------row no=1 str=||