OutOfMemoryException loading data via JPA: Need help analyzing - mysql

I wrote an application (Springboot + Data JPA + Data Rest) that keeps throwing OutOfMemoryException at me when the application loads. I can skip that code that runs on application start but then the exception may happen later down the road. It's probably best to show you what happens on application start because it's actually super simple and should not cause any problems imho:
#SpringBootApplication
#EnableAsync
#EnableJpaAuditing
public class ScraperApplication {
public static void main(String[] args) {
SpringApplication.run(ScraperApplication.class, args);
}
}
#Component
#RequiredArgsConstructor(onConstructor = #__(#Autowired))
public class DefaultDataLoader {
private final #NonNull LuceneService luceneService;
#Transactional
#EventListener(ApplicationReadyEvent.class)
public void load() {
luceneService.reindexData();
}
}
#Service
#RequiredArgsConstructor(onConstructor = #__(#Autowired))
public class LuceneService {
private static final Log LOG = LogFactory.getLog(LuceneService.class);
private final #NonNull TrainingRepo trainingRepo;
private final #NonNull EntityManager entityManager;
public void reindexData() {
LOG.info("Reindexing triggered");
FullTextEntityManager fullTextEntityManager = Search.getFullTextEntityManager(entityManager);
fullTextEntityManager.purgeAll(Training.class);
LOG.info("Index purged");
int page = 0;
int size = 100;
boolean morePages = true;
Page<Training> pageData;
while (morePages) {
pageData = trainingRepo.findAll(PageRequest.of(page, size));
LOG.info("Loading page " + (page + 1) + "/" + pageData.getTotalPages());
pageData.getContent().stream().forEach(t -> fullTextEntityManager.index(t));
fullTextEntityManager.flushToIndexes(); // flush regularly to keep memory footprint low
morePages = pageData.getTotalPages() > ++page;
}
fullTextEntityManager.flushToIndexes();
LOG.info("Index flushed");
}
}
You can see what I am doing is clear out the index, read all Trainings from the TrainingRepo in a paged way (100 at a time) and write them into the index. Not much going on actually. A few minutes after the "Index purged" message I get this - and only this:
java.lang.OutOfMemoryError: Java heap space
In the logs I get to see "Index purged" but never see any "Loading page ..." message, so it must be stuck on the findAll() call.
I had the JVM write a heap dump and loaded it into Eclipse Memory Analyzer and got a full stack trace: https://gist.github.com/mathias-ewald/2fddb9762427374bb04d332bd0b6b499
I also looked around the report a bit, but I need help interpreting this information which is why I attached some screenshots from Eclipse Memory Analyzer.
EDIT:
I just enabled "show-sql" and saw this before everything hung:
Hibernate: select training0_.id as id1_9_, training0_.created_date as created_2_9_, training0_.description as descript3_9_, training0_.duration_days as duration4_9_, training0_.execution_id as executi14_9_, training0_.level as level5_9_, training0_.modified_date as modified6_9_, training0_.name as name7_9_, training0_.price as price8_9_, training0_.product as product9_9_, training0_.quality as quality10_9_, training0_.raw as raw11_9_, training0_.url as url12_9_, training0_.vendor as vendor13_9_ from training training0_ where not (exists (select 1 from training training1_ where training0_.url=training1_.url and training0_.created_date<training1_.created_date)) limit ?
Hibernate: select execution0_.id as id1_1_0_, execution0_.created_date as created_2_1_0_, execution0_.duration_millis as duration3_1_0_, execution0_.message as message4_1_0_, execution0_.modified_date as modified5_1_0_, execution0_.scraper as scraper6_1_0_, execution0_.stats_id as stats_id8_1_0_, execution0_.status as status7_1_0_, properties1_.execution_id as executio1_2_1_, properties1_.properties as properti2_2_1_, properties1_.properties_key as properti3_1_, stats2_.id as id1_5_2_, stats2_.avg_quality as avg_qual2_5_2_, stats2_.max_quality as max_qual3_5_2_, stats2_.min_quality as min_qual4_5_2_, stats2_.null_products as null_pro5_5_2_, stats2_.null_vendors as null_ven6_5_2_, stats2_.products as products7_5_2_, stats2_.tags as tags8_5_2_, stats2_.trainings as training9_5_2_, stats2_.vendors as vendors10_5_2_, producthis3_.stats_id as stats_id1_6_3_, producthis3_.product_histogram as product_2_6_3_, producthis3_.product_histogram_key as product_3_3_, taghistogr4_.stats_id as stats_id1_7_4_, taghistogr4_.tag_histogram as tag_hist2_7_4_, taghistogr4_.tag_histogram_key as tag_hist3_4_, vendorhist5_.stats_id as stats_id1_8_5_, vendorhist5_.vendor_histogram as vendor_h2_8_5_, vendorhist5_.vendor_histogram_key as vendor_h3_5_ from execution execution0_ left outer join execution_properties properties1_ on execution0_.id=properties1_.execution_id left outer join stats stats2_ on execution0_.stats_id=stats2_.id left outer join stats_product_histogram producthis3_ on stats2_.id=producthis3_.stats_id left outer join stats_tag_histogram taghistogr4_ on stats2_.id=taghistogr4_.stats_id left outer join stats_vendor_histogram vendorhist5_ on stats2_.id=vendorhist5_.stats_id where execution0_.id=?
Apparently, it creates the statement to fetch all the Training entities but the Execution statement is the last it manages to execute.
I changed the relation from Training to Execution from #ManyToOne to #ManyToOne(fetch = FetchType.LAZY) and suddenly I the code was able to load data into the index again. So I am thinking something might be wrong with my Execution entity mapping. Let me share the code with you:
#Entity
#Data
#EntityListeners(AuditingEntityListener.class)
public class Execution {
public enum Status { SCHEDULED, RUNNING, SUCCESS, FAILURE };
#Id
#GeneratedValue(strategy = GenerationType.IDENTITY)
#ToString.Include
private Long id;
#Column(updatable = false)
private String scraper;
#CreatedDate
private LocalDateTime createdDate;
#LastModifiedDate
private LocalDateTime modifiedDate;
#Min(0)
#JsonProperty(access = Access.READ_ONLY)
private Long durationMillis;
#ElementCollection(fetch = FetchType.EAGER)
private Map<String, String> properties;
#NotNull
#Enumerated(EnumType.STRING)
private Status status;
#Column(length = 9999999)
private String message;
#EqualsAndHashCode.Exclude
#OneToOne(cascade = CascadeType.ALL)
private Stats stats;
}
And since it is a relation of Execution, here's the Stats entity, too:
#Entity
#Data
public class Stats {
#Id
#GeneratedValue(strategy = GenerationType.IDENTITY)
#ToString.Include
private Long id;
private Long trainings;
private Long vendors;
private Long products;
private Long tags;
private Long nullVendors;
private Long nullProducts;
private Double minQuality;
private Double avgQuality;
private Double maxQuality;
#ElementCollection(fetch = FetchType.EAGER)
private Map<String, Long> vendorHistogram;
#ElementCollection(fetch = FetchType.EAGER)
private Map<String, Long> productHistogram;
#ElementCollection(fetch = FetchType.EAGER)
private Map<String, Long> tagHistogram;
}

All this is running in a single transaction and I can't see a clear here, so the EntityManager loading all this data still references it.
To fix this inject the EntityManager and invoke clear. Or alternatively make the scope of the transaction the processing of one page.
I recommend the TransactionTemplate for this.
I'm not familiar with the FullTextEntityManager but it might have similar problems.
For more background you might want to read up on the JPA entity lifecycle.

I believe it has to do with your FullTextEntityManager not finding enough memory. You have to configure your queryPlanCache.Go through this thread on how to Stackoverflow and this one too.

Related

JPA Specification multiple join based on foreignkey

I have following relationships between three objects
public class ProductEntity {
#Id
private int id;
#OneToMany(mappedBy = "productEntity",
fetch = FetchType.LAZY)
private List<ProductInfoEntity> productInfoEntityList = new ArrayList<>();
#Column(name = "snippet")
private String snippet;
}
public class ProductInfoEntity {
#Id
private int id;
#ManyToOne
#JoinColumn(name = "product_id")
private ProductEntity productEntity;
#ManyToOne
#JoinColumn(name = "support_language_id")
private SupportLanguageEntity supportLanguageEntity;
}
public class SupportLanguageEntity {
#Id
private int id;
#Column("name")
private String name;
}
And this is actual database design
Then, I'd like to make a specification to query as followings:
select * from product_info
where product_id = 1
and support_language_id = 2;
I am also using annotation for the specification which means that I use ProductEntity_, ProductInfoEntity_ and so on.
Can you please give me a full working code for the specification for query mentioned above?
Thank you guys
To use Specification your ProductInfoEntityRepository have to extend JpaSpecificationExecutor
#Repository
public interface ProductInfoEntityRepository
extends JpaRepository<ProductInfoEntity, Integer>, JpaSpecificationExecutor<ProductInfoEntity> {
}
As far as I understand you use JPA metamodel. So then
#Autowired
ProductInfoEntityRepository repository;
public List<ProductInfoEntity> findProductInfoEntities(int productId, int languageId) {
return repository.findAll((root, query, builder) -> {
Predicate productPredicate = builder.equal(
root.get(ProductInfoEntity_.productEntity).get(ProductEntity_.id), // or root.get("productEntity").get("id")
productId);
Predicate languagePredicate = builder.equal(
root.get(ProductInfoEntity_.supportLanguageEntity).get(SupportLanguageEntity_.id), // or root.get("supportLanguageEntity").get("id")
languageId);
return builder.and(productPredicate, languagePredicate);
});
}
If you want to make specifications reusable you should create utility class contains two static methods productIdEquals(int) and languageIdEquals(int).
To combine them use Specifications(Spring Data JPA 1.*) or Specification(since Spring Data JPA 2.0)
select * from product_info where product_id = 1 and support_language_id = 2;
Should work as written. But the only thing useful will be comment.
Perhaps you want the rest of the info in all three tables?
SELECT pi.comment, -- list the columns you need
p.snippet,
sl.name
FROM product AS p -- table, plus convenient "alias"
JOIN product_info AS pi -- another table
ON p.id = pi.product_info -- explain how the tables are related
JOIN support_language AS sl -- and another
ON pi.support_language_id = sl.id -- how related
WHERE p.snippet = 'abc' -- it is more likely that you will start here
-- The query will figure out the rest.
From there, see if you can work out the obfuscation provided by JPA.

How handle currents updates in spring-boot hibernate problem? Also need to make app scalable

Project type :- Spring-boot JPA project
Hi,
I have below Rest service which increments a number in database.
#RestController
public class IncrementController {
#Autowired
MyNumberRepository mynumberRepository;
#GetMapping(path="/incrementnumber")
public String incrementNumber(){
Optional<MyNumber> mynumber = mynumberRepository.findById(1);
int i = mynumber.get().getNumber();
System.out.println("value of no is "+i);
i = i+1;
System.out.println("value of no post increment is "+i);
mynumber.get().setNumber(i);
MyNumber entity = new MyNumber();
entity.setId(1);
entity.setNumber(i);
mynumberRepository.save(entity);
return "done";
}
}
Entity is as below :-
#Entity
#Table(name = "my_number")
public class MyNumber {
#Id
private Integer id;
private Integer number;
public Integer getId() {
return id;
}
public void setId(Integer id) {
this.id = id;
}
public Integer getNumber() {
return number;
}
public void setNumber(Integer number) {
this.number = number;
}
}
Below is the Repository :-
public interface MyNumberRepository extends JpaRepository<MyNumber, Integer>{
}
The service works well when I call increment number sequentially , but when concurrent threads call the incrementservice then i get non consistent results. How can I handle this situation ?
Also have to deploy the app on multiple places and connecting to same DB. i.e Scalability concern.
Thanks,
Rahul
You must use a pessimistic lock. This will issue a SELECT FOR UPDATE and lock the row for the transaction and it's not possible for another transaction to overwrite the row.
public interface MyNumberRepository extends JpaRepository<MyNumber, Integer> {
#Lock(LockModeType.PESSIMISTIC_WRITE)
Optional<MyNumber> findById(Integer id);
}
And then you have to make your REST method transactional by adding #Transactional
#RestController
public class IncrementController {
#Autowired
MyNumberRepository mynumberRepository;
#Transactional
#GetMapping(path="/incrementnumber")
public String incrementNumber(){
Optional<MyNumber> mynumber = mynumberRepository.findById(1);
int i = mynumber.get().getNumber();
System.out.println("value of no is "+i);
i = i+1;
System.out.println("value of no post increment is "+i);
mynumber.get().setNumber(i);
MyNumber entity = new MyNumber();
entity.setId(1);
entity.setNumber(i);
mynumberRepository.save(entity);
return "done";
}
}
Above solution will work , but i feel you are doing over-engineering for very simple problem.
My recommendation would be to use database sequence.I feel your requirement is quite straight forward.In your service u can simply call getnextvalue on the sequence and then set the value in the Id field.This way u don't have to manage locks also as Database will do that for you.
In oracle particularly sequences are managed in a different transactions . So if ur calling code fails with exception , still the value of sequence will be incremented . This will ensure that multi-threads will not see the same value of the sequence in case of exceptions.
Instead of locking transaction, you could also use an Oracle sequence or MySQL "AUTO_INCREMENT" feature which will prevent any ID being returned twice.
https://community.oracle.com/thread/4156674
Thread safety of MySql's Select Last_Insert_ID

saving large object takes too long on hibernate

I have an object with a Blob column requestData and a Text Column "requestDataText" .
These two fields may hold large Data. In my example , the blob data is around 1.2 MBs and the Text column holds the text equivalent of that Data.
When i try to commit this single entity , it takes around 20 seconds .
DBUtil.beginTransaction();
session.saveOrUpdate(entity);
DBUtil.commitTransaction();
Is there something wrong or is there a way to shorten this period ?
package a.db.entity;
// Generated Feb 22, 2016 11:57:10 AM by Hibernate Tools 3.2.1.GA
/**
* Foo generated by hbm2java
*/
#Entity
#Table(name="foo"
,catalog="bar"
)
public class Foo implements java.io.Serializable {
private Long id;
private Date reqDate;
private byte[] requestData;
private String requestDataText;
private String functionName;
private boolean confirmed;
private boolean processed;
private boolean errorOnProcess;
private Date processStartedAt;
private Date processFinishedAt;
private String responseText;
private String processResult;
private String miscData;
public AsyncRequestLog() {
}
#Id #GeneratedValue(strategy=IDENTITY)
#Column(name="Id", unique=true, nullable=false)
public Long getId() {
return this.id;
}
public void setId(Long id) {
this.id = id;
}
...
}
I just noticed you're starting a transaction and then doing a saveOrUpdate() which might explain the slow down, as hibernate will try to retrieve the row from the DB first (as explained on this other SO answer).
If you know if the entity is new call save() and if you the entity has to be updated call update().
Another suggestion, but I'm not sure if this applies any more to MySQL, try to store the blobs/clobs in a different table from where you store the data, if you are intending to update the blob/clobs. In the past this mix made MySQL run slow as it had to resize the 'block' allocated to a row. So have one table with all the attributes and a different table just for the blob/clob. This is not the case if the table is read-only.

Hibernate deletion issue with a bidirectional association

I am using Spring Data JPA (1.7.2-RELEASE) in combination with Hibernate (4.3.8.Final) and MySQL (5.5). I want to manage two entities in a bidirectional assosciation. The save and update of the enties works fine, but the deletion doesn't work.
#Entity
public class Beacon extends AbstractEntity {
#OneToMany(fetch = FetchType.EAGER, mappedBy = "beacon", cascade = ALL)
private Set<Comment> comments;
/**
* #return the comments
*/
public Set<Comment> getComments() {
return comments;
}
/**
* #param comments the comments to set
*/
public void setComments(Set<Comment> comments) {
this.comments = comments;
}
}
and
#Entity
public class Comment extends AbstractEntity {
#ManyToOne(fetch = FetchType.EAGER)
#JoinColumn(name = "beacon_id")
private Beacon beacon;
public Beacon getBeacon() {
return beacon;
}
public void setBeacon(Beacon beacon) {
this.beacon = beacon;
}
}
Having a beacon with comments stored in the database, I want to delete the comment but it doesn't work. I don't get an exception but the entity is still present in the database.
This is my unit test:
#Test
public void deleteWithStrategyCheck() {
Beacon beacon = this.beaconRepository.save(createBeacon());
Comment comment = this.commentRepository.save(createEntity());
comment.setBeacon(beacon);
comment = this.commentRepository.save(comment);
this.commentRepository.delete(comment.getId());
assertThat(this.commentRepository.exists(comment.getId())).isFalse();
assertThat(this.beaconRepository.exists(beacon.getId())).isTrue();
assertThat(this.beaconRepository.findOne(beacon.getId()).getComments()).doesNotContain(comment);
}
If I delete the comment via a sql statement it works.
You need to add orphanRemoval = true to your #OneToMany mappings, and remove the Comment from the parrent beacon.
If you delete the Comment without removing it from the parrent collection you should actually get the exception (unless you are not using InnoDB storage engine, (and ou should)).
beacon.getComments().remove(comment),
will do the work then. (with orphanRemoval you don't need to call EM.remove(comment). Without it, you need to remove the comment from the collection and call EM.remove(comment).

The most efficient way to store photo reference in a database

I'm currently looking to store approximately 3.5 million photo's from approximately 100/200k users. I'm only using a mysql database on aws. My question is in regards to the most efficient way to store the photo reference. I'm only aware of two ways and I'm looking for an expert opinion.
Choice A
A user table with a photo_url column, in that column I would build a comma separated list of photo's that both maintain the name and sort order. The business logic would handle extracting the path from the photo name and append photo size. The downside is the processing expense.
Database example
"0ea102, e435b9, etc"
Business logic would build the following urls from photo name
/0e/a1/02.jpg
/0e/a1/02_thumb.jpg
/e4/35/b9.jpg
/e4/35/b9_thumb.jpg
Choice B - Relational Table joined on user table with the following fields. I'm just concerned I may have potential database performance issues.
pk
user_id
photo_url_800
photo_url_150
photo_url_45
order
Does anybody have any suggestions on the better solution?
The best and most common answer would be: choice B - Relational Table joined on user table with the following fields.
id
order
user_id
desc
photo_url_800
photo_url_150
photo_url_45
date_uploaded
Or a hybrid, wherein, you store the file names individually and add the photo directory with your business logic layer.
My analysis, your first option is a bad practice. Comma separated fields are not advisable for database. It would be difficult for you to update these fields and add description on it.
Regarding the table optimization, you might want to see these articles:
Optimizing MyISAM Queries
Optimizing InnoDB Queries
Here is an example of my final solution using the hibernate ORM, Christian Mark, and my hybrid solution.
#Entity
public class Photo extends StatefulEntity {
private static final String FILE_EXTENSION_JPEG = ".jpg";
private static final String ROOT_PHOTO_URL = "/photo/";
private static final String PHOTO_SIZE_800 = "_800";
private static final String PHOTO_SIZE_150 = "_150";
private static final String PHOTO_SIZE_100 = "_100";
private static final String PHOTO_SIZE_50 = "_50";
#ManyToOne
#JoinColumn(name = "profile_id", nullable = false)
private Profile profile;
//Example "a1d2b0" which will later get parsed into "/photo/a1/d2/b0_size.jpg"
//using the generatePhotoUrl business logic below.
#Column(nullable = false, length = 6)
private String fileName;
private boolean temp;
#Column(nullable = false)
private int orderBy;
#Temporal(TemporalType.TIMESTAMP)
private Date dateUploaded;
public Profile getProfile() {
return profile;
}
public void setProfile(Profile profile) {
this.profile = profile;
}
public String getFileName() {
return fileName;
}
public void setFileName(String fileName) {
this.fileName = fileName;
}
public Date getDateUploaded() {
return dateUploaded;
}
public void setDateUploaded(Date dateUploaded) {
this.dateUploaded = dateUploaded;
}
public boolean isTemp() {
return temp;
}
public void setTemp(boolean temp) {
this.temp = temp;
}
public int getOrderBy() {
return orderBy;
}
public void setOrderBy(int orderBy) {
this.orderBy = orderBy;
}
public String getPhotoSize800() {
return generatePhotoURL(PHOTO_SIZE_800);
}
public String getPhotoSize150() {
return generatePhotoURL(PHOTO_SIZE_150);
}
public String getPhotoSize100() {
return generatePhotoURL(PHOTO_SIZE_100);
}
public String getPhotoSize50() {
return generatePhotoURL(PHOTO_SIZE_50);
}
private String generatePhotoURL(String photoSize) {
String firstDir = getFileName().substring(0, 2);
String secondDir = getFileName().substring(2, 4);
String photoName = getFileName().substring(4, 6);
StringBuilder sb = new StringBuilder();
sb.append(ROOT_PHOTO_URL);
sb.append("/");
sb.append(firstDir);
sb.append("/");
sb.append(secondDir);
sb.append("/");
sb.append(photoName);
sb.append(photoSize);
sb.append(FILE_EXTENSION_JPEG);
return sb.toString();
}
}