/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.activemq.store.kahadb.scheduler;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.File;
import java.io.FilenameFilter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.TreeSet;
import java.util.UUID;

import org.apache.activemq.broker.scheduler.JobScheduler;
import org.apache.activemq.broker.scheduler.JobSchedulerStore;
import org.apache.activemq.protobuf.Buffer;
import org.apache.activemq.store.kahadb.AbstractKahaDBStore;
import org.apache.activemq.store.kahadb.JournalCommand;
import org.apache.activemq.store.kahadb.KahaDBMetaData;
import org.apache.activemq.store.kahadb.Visitor;
import org.apache.activemq.store.kahadb.data.KahaAddScheduledJobCommand;
import org.apache.activemq.store.kahadb.data.KahaDestroySchedulerCommand;
import org.apache.activemq.store.kahadb.data.KahaRemoveScheduledJobCommand;
import org.apache.activemq.store.kahadb.data.KahaRemoveScheduledJobsCommand;
import org.apache.activemq.store.kahadb.data.KahaRescheduleJobCommand;
import org.apache.activemq.store.kahadb.data.KahaTraceCommand;
import org.apache.activemq.store.kahadb.disk.index.BTreeVisitor;
import org.apache.activemq.store.kahadb.disk.journal.DataFile;
import org.apache.activemq.store.kahadb.disk.journal.Location;
import org.apache.activemq.store.kahadb.disk.page.Page;
import org.apache.activemq.store.kahadb.disk.page.PageFile;
import org.apache.activemq.store.kahadb.disk.page.Transaction;
import org.apache.activemq.store.kahadb.disk.util.VariableMarshaller;
import org.apache.activemq.store.kahadb.scheduler.legacy.LegacyStoreReplayer;
import org.apache.activemq.util.ByteSequence;
import org.apache.activemq.util.IOHelper;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * A {@link JobSchedulerStore} that keeps its {@link JobSchedulerImpl} instances in a
 * page file index whose contents are rebuilt from the journal on recovery.
 */
public class JobSchedulerStoreImpl extends AbstractKahaDBStore implements JobSchedulerStore {

    private static final Logger LOG = LoggerFactory.getLogger(JobSchedulerStoreImpl.class);

    private JobSchedulerKahaDBMetaData metaData = new JobSchedulerKahaDBMetaData(this);
    private final MetaDataMarshaller metaDataMarshaller = new MetaDataMarshaller(this);
    private final Map<String, JobSchedulerImpl> schedulers = new HashMap<String, JobSchedulerImpl>();
    private File legacyStoreArchiveDirectory;

    /**
     * The Scheduler Token is used to identify base revisions of the Scheduler store. A store
     * based on the initial scheduler design will not have this tag in its meta-data and will
     * indicate an update is needed. Later versions of the scheduler can also change this value
     * to indicate incompatible store bases which require complete meta-data and journal rewrites
     * instead of simpler meta-data updates.
     */
    static final UUID SCHEDULER_STORE_TOKEN = UUID.fromString("57ed642b-1ee3-47b3-be6d-b7297d500409");

    /**
     * The default scheduler store version. All new store instances will be given this version
     * and earlier versions will be updated to this version.
     */
    static final int CURRENT_VERSION = 1;

    /**
     * Returns the scheduler registered under the given name. If none is registered a new
     * one is created, its indexes are created and loaded, and it is added to the store
     * meta-data. A newly created scheduler is started when the store itself is started.
     *
     * @param name
     *        The name of the scheduler to look up or create.
     *
     * @return the existing or newly created scheduler.
     *
     * @throws Exception if the scheduler cannot be created, persisted or started.
     */
    @Override
    public JobScheduler getJobScheduler(final String name) throws Exception {
        this.indexLock.writeLock().lock();
        try {
            JobSchedulerImpl result = this.schedulers.get(name);
            if (result == null) {
                final JobSchedulerImpl js = new JobSchedulerImpl(this);
                js.setName(name);
                getPageFile().tx().execute(new Transaction.Closure<IOException>() {
                    @Override
                    public void execute(Transaction tx) throws IOException {
                        js.createIndexes(tx);
                        js.load(tx);
                        metaData.getJobSchedulers().put(tx, name, js);
                    }
                });
                result = js;
                this.schedulers.put(name, js);
                if (isStarted()) {
                    result.start();
                }
                this.pageFile.flush();
            }
            return result;
        } finally {
            this.indexLock.writeLock().unlock();
        }
    }

    /**
     * Stops the named scheduler, removes it from the store meta-data and removes all of
     * its entries from the index.
     *
     * @param name
     *        The name of the scheduler to remove.
     *
     * @return true if a scheduler with the given name was registered and has been removed.
     *
     * @throws Exception if the scheduler cannot be stopped or the index update fails.
     */
    @Override
    public boolean removeJobScheduler(final String name) throws Exception {
        boolean result = false;

        this.indexLock.writeLock().lock();
        try {
            final JobSchedulerImpl js = this.schedulers.remove(name);
            result = js != null;
            if (result) {
                js.stop();
                getPageFile().tx().execute(new Transaction.Closure<IOException>() {
                    @Override
                    public void execute(Transaction tx) throws IOException {
                        metaData.getJobSchedulers().remove(tx, name);
                        js.removeAll(tx);
                    }
                });
            }
        } finally {
            this.indexLock.writeLock().unlock();
        }
        return result;
    }

    /**
     * Sets the directory where the legacy scheduler store files are archived before an
     * update attempt is made. Both the legacy index files and the journal files are moved
     * to this folder prior to an upgrade attempt.
     *
     * @param directory
     *        The directory to move the legacy Scheduler Store files to.
     */
    public void setLegacyStoreArchiveDirectory(File directory) {
        this.legacyStoreArchiveDirectory = directory;
    }

    /**
     * Gets the directory where the legacy Scheduler Store files will be archived if the
     * broker is started and an existing Job Scheduler Store from an old version is detected.
     * When no directory has been set, a "legacySchedulerStore" folder inside the store
     * directory is used.
     *
     * @return the directory where scheduler store legacy files are archived on upgrade.
     */
    public File getLegacyStoreArchiveDirectory() {
        if (this.legacyStoreArchiveDirectory == null) {
            this.legacyStoreArchiveDirectory = new File(getDirectory(), "legacySchedulerStore");
        }

        return this.legacyStoreArchiveDirectory.getAbsoluteFile();
    }

    /**
     * Opens the store if it is not already open: starts the journal, loads the page file,
     * starts the checkpoint and recovers the index from the journal.
     *
     * An {@link UnknownStoreVersionException} raised while loading the page file triggers
     * an upgrade from the legacy store format. Any other failure is treated as a corrupted
     * index, which is archived or deleted and then rebuilt.
     *
     * @throws IOException if the store cannot be loaded or recovered.
     */
    @Override
    public void load() throws IOException {
        if (opened.compareAndSet(false, true)) {
            getJournal().start();
            try {
                loadPageFile();
            } catch (UnknownStoreVersionException ex) {
                LOG.info("Can't start until store update is performed.");
                upgradeFromLegacy();
                // Restart with the updated store
                getJournal().start();
                loadPageFile();
                LOG.info("Update from legacy Scheduler store completed successfully.");
            } catch (Throwable t) {
                LOG.warn("Index corrupted. Recovering the index through journal replay. Cause: {}", t.toString());
                LOG.debug("Index load failure", t);

                // try to recover index
                try {
                    pageFile.unload();
                } catch (Exception ignored) {
                    // Best effort only, the page file is archived or deleted right below.
                }
                if (isArchiveCorruptedIndex()) {
                    pageFile.archive();
                } else {
                    pageFile.delete();
                }
                metaData = new JobSchedulerKahaDBMetaData(this);
                pageFile = null;
                loadPageFile();
            }
            startCheckpoint();
            recover();
        }
        LOG.info("{} started.", this);
    }

    /**
     * Closes the store if it is open: stops all schedulers, stores the meta-data in the
     * closed state, runs a final checkpoint, waits for the checkpoint thread and then
     * unloads the page file and closes the journal.
     *
     * If the calling thread is interrupted while waiting for the checkpoint thread, the
     * remaining cleanup is still performed and the interrupt status is restored afterwards.
     *
     * @throws IOException if a scheduler fails to stop or the store cannot be closed cleanly.
     */
    @Override
    public void unload() throws IOException {
        if (opened.compareAndSet(true, false)) {
            for (JobSchedulerImpl js : this.schedulers.values()) {
                try {
                    js.stop();
                } catch (Exception e) {
                    throw new IOException(e);
                }
            }
            this.indexLock.writeLock().lock();
            try {
                if (pageFile != null && pageFile.isLoaded()) {
                    metaData.setState(KahaDBMetaData.CLOSED_STATE);

                    if (metaData.getPage() != null) {
                        pageFile.tx().execute(new Transaction.Closure<IOException>() {
                            @Override
                            public void execute(Transaction tx) throws IOException {
                                tx.store(metaData.getPage(), metaDataMarshaller, true);
                            }
                        });
                    }
                }
            } finally {
                this.indexLock.writeLock().unlock();
            }

            checkpointLock.writeLock().lock();
            try {
                if (metaData.getPage() != null) {
                    checkpointUpdate(true);
                }
            } finally {
                checkpointLock.writeLock().unlock();
            }

            // Remember an interrupt instead of swallowing it; it is re-asserted once the
            // page file and journal have been released so the cleanup itself is not disturbed.
            boolean interrupted = false;
            synchronized (checkpointThreadLock) {
                if (checkpointThread != null) {
                    try {
                        checkpointThread.join();
                        checkpointThread = null;
                    } catch (InterruptedException e) {
                        interrupted = true;
                    }
                }
            }

            try {
                if (pageFile != null) {
                    pageFile.unload();
                    pageFile = null;
                }
                if (this.journal != null) {
                    journal.close();
                    journal = null;
                }

                metaData = new JobSchedulerKahaDBMetaData(this);
            } finally {
                if (interrupted) {
                    Thread.currentThread().interrupt();
                }
            }
        }
        LOG.info("{} stopped.", this);
    }

    /**
     * Loads the page file and the store meta-data held in page zero, creating and storing
     * fresh meta-data when the page file is empty, then loads and starts every scheduler
     * found in the meta-data.
     *
     * @throws IOException if the page file or the meta-data cannot be loaded.
     */
    private void loadPageFile() throws IOException {
        this.indexLock.writeLock().lock();
        try {
            final PageFile pageFile = getPageFile();
            pageFile.load();
            pageFile.tx().execute(new Transaction.Closure<IOException>() {
                @Override
                public void execute(Transaction tx) throws IOException {
                    if (pageFile.getPageCount() == 0) {
                        Page<JobSchedulerKahaDBMetaData> page = tx.allocate();
                        assert page.getPageId() == 0;
                        page.set(metaData);
                        metaData.setPage(page);
                        metaData.setState(KahaDBMetaData.CLOSED_STATE);
                        metaData.initialize(tx);
                        tx.store(metaData.getPage(), metaDataMarshaller, true);
                    } else {
                        Page<JobSchedulerKahaDBMetaData> page = tx.load(0, metaDataMarshaller);
                        metaData = page.get();
                        metaData.setPage(page);
                    }
                    metaData.load(tx);
                    metaData.loadScheduler(tx, schedulers);
                    for (JobSchedulerImpl js : schedulers.values()) {
                        try {
                            js.start();
                        } catch (Exception e) {
                            // A scheduler that fails to start is logged and the rest still load.
                            LOG.error("Failed to load {}", js.getName(), e);
                        }
                    }
                }
            });

            pageFile.flush();
        } finally {
            this.indexLock.writeLock().unlock();
        }
    }

    /**
     * Upgrades a legacy scheduler store: the old store files are moved to the legacy
     * archive directory, a clean store is created in their place and the archived jobs
     * are replayed into it. The journal and page file are closed again on completion so
     * that the caller can restart with the updated store.
     *
     * @throws IOException if the files cannot be moved or the replay fails.
     */
    private void upgradeFromLegacy() throws IOException {

        journal.close();
        journal = null;
        try {
            pageFile.unload();
            pageFile = null;
        } catch (Exception ignored) {
            // Best effort only, the page file reference is reset below before reloading.
        }

        File storeDir = getDirectory().getAbsoluteFile();
        File storeArchiveDir = getLegacyStoreArchiveDirectory();

        LOG.info("Attempting to move old store files from {} to {}", storeDir, storeArchiveDir);

        // Move only the known store files, locks and other items left in place.
        IOHelper.moveFiles(storeDir, storeArchiveDir, new FilenameFilter() {

            @Override
            public boolean accept(File dir, String name) {
                return name.endsWith(".data") || name.endsWith(".redo")
                    || name.endsWith(".log") || name.endsWith(".free");
            }
        });

        // We reset everything to clean state, then we can read from the old
        // scheduler store and replay the scheduled jobs into this one as adds.
        getJournal().start();
        metaData = new JobSchedulerKahaDBMetaData(this);
        pageFile = null;
        loadPageFile();

        LegacyStoreReplayer replayer = new LegacyStoreReplayer(getLegacyStoreArchiveDirectory());
        replayer.load();
        replayer.startReplay(this);

        // Cleanup after replay and store what we've done.
        pageFile.tx().execute(new Transaction.Closure<IOException>() {
            @Override
            public void execute(Transaction tx) throws IOException {
                tx.store(metaData.getPage(), metaDataMarshaller, true);
            }
        });

        checkpointUpdate(true);
        getJournal().close();
        getPageFile().unload();
    }

    /**
     * Stores the meta-data in the open state and, when cleanup is requested, removes the
     * journal data files that are no longer needed.
     *
     * A data file is kept when it holds the last update location, when it still has a
     * journal reference count entry, or when it contains remove commands whose matching
     * add commands live in a data file that still exists.
     *
     * @param tx
     *        The TX under which the checkpoint is performed.
     * @param cleanup
     *        Whether unreferenced journal data files should be removed.
     *
     * @throws IOException if the meta-data cannot be stored or the journal cleanup fails.
     */
    @Override
    protected void checkpointUpdate(Transaction tx, boolean cleanup) throws IOException {
        LOG.debug("Job Scheduler Store Checkpoint started.");

        // reflect last update exclusive of current checkpoint
        Location lastUpdate = metaData.getLastUpdateLocation();
        metaData.setState(KahaDBMetaData.OPEN_STATE);
        tx.store(metaData.getPage(), metaDataMarshaller, true);
        pageFile.flush();

        if (cleanup) {
            final TreeSet<Integer> completeFileSet = new TreeSet<Integer>(journal.getFileMap().keySet());
            final TreeSet<Integer> gcCandidateSet = new TreeSet<Integer>(completeFileSet);

            LOG.trace("Last update: {}, full gc candidates set: {}", lastUpdate, gcCandidateSet);

            if (lastUpdate != null) {
                gcCandidateSet.remove(lastUpdate.getDataFileId());
            }

            this.metaData.getJournalRC().visit(tx, new BTreeVisitor<Integer, Integer>() {

                @Override
                public void visit(List<Integer> keys, List<Integer> values) {
                    for (Integer key : keys) {
                        if (gcCandidateSet.remove(key)) {
                            LOG.trace("Removed referenced file: {} from GC set", key);
                        }
                    }
                }

                @Override
                public boolean isInterestedInKeysBetween(Integer first, Integer second) {
                    return true;
                }
            });

            LOG.trace("gc candidates after reference check: {}", gcCandidateSet);

            // If there are GC candidates then check the remove command location to see
            // if any of them can go or if they must stay in order to ensure proper recover.
            //
            // A log containing any remove commands must be kept until all the logs with the
            // add commands for all the removed jobs have been dropped.
            if (!gcCandidateSet.isEmpty()) {
                Iterator<Entry<Integer, List<Integer>>> removals = metaData.getRemoveLocationTracker().iterator(tx);
                List<Integer> orphans = new ArrayList<Integer>();
                while (removals.hasNext()) {
                    boolean orphanedRemove = true;
                    Entry<Integer, List<Integer>> entry = removals.next();

                    // If this log is not a GC candidate then there's no need to do a check to rule it out
                    if (gcCandidateSet.contains(entry.getKey())) {
                        for (Integer addLocation : entry.getValue()) {
                            if (completeFileSet.contains(addLocation)) {
                                LOG.trace("A remove in log {} has an add still in existance in {}.", entry.getKey(), addLocation);
                                orphanedRemove = false;
                                break;
                            }
                        }

                        // If it's not orphaned then we can't remove it, otherwise we stop
                        // tracking it; the log is dropped with the GC candidates below.
                        if (!orphanedRemove) {
                            gcCandidateSet.remove(entry.getKey());
                        } else {
                            LOG.trace("All removes in log {} are orphaned, file can be GC'd", entry.getKey());
                            orphans.add(entry.getKey());
                        }
                    }
                }

                // Drop all orphaned removes from the tracker.
                for (Integer orphan : orphans) {
                    metaData.getRemoveLocationTracker().remove(tx, orphan);
                }
            }

            LOG.trace("gc candidates after removals check: {}", gcCandidateSet);
            if (!gcCandidateSet.isEmpty()) {
                LOG.debug("Cleanup removing the data files: {}", gcCandidateSet);
                journal.removeDataFiles(gcCandidateSet);
            }
        }

        LOG.debug("Job Scheduler Store Checkpoint complete.");
    }

    /**
     * Adds a reference for the journal log file pointed to by the given Location value.
     *
     * To prevent log files in the journal that still contain valid data that needs to be
     * kept in order to allow for recovery the logs must have active references. Each Job
     * scheduler should ensure that the logs are accurately referenced.
     *
     * @param tx
     *        The TX under which the update is to be performed.
     * @param location
     *        The location value to update the reference count of.
     *
     * @throws IOException if an error occurs while updating the journal references table.
     */
    protected void incrementJournalCount(Transaction tx, Location location) throws IOException {
        int logId = location.getDataFileId();
        Integer val = metaData.getJournalRC().get(tx, logId);
        int refCount = val != null ? val.intValue() + 1 : 1;
        metaData.getJournalRC().put(tx, logId, refCount);
    }

    /**
     * Removes one reference for the Journal log file indicated in the given Location value.
     *
     * The references are used to track which log files cannot be GC'd. When the reference count
     * on a log file reaches zero the file id is removed from the tracker and the log will be
     * removed on the next check point update.
     *
     * @param tx
     *        The TX under which the update is to be performed.
     * @param location
     *        The location value to update the reference count of.
     *
     * @throws IOException if an error occurs while updating the journal references table.
     */
    protected void decrementJournalCount(Transaction tx, Location location) throws IOException {
        int logId = location.getDataFileId();
        Integer refCount = metaData.getJournalRC().get(tx, logId);
        if (refCount != null) {
            int refCountValue = refCount;
            refCountValue--;
            if (refCountValue <= 0) {
                metaData.getJournalRC().remove(tx, logId);
            } else {
                metaData.getJournalRC().put(tx, logId, refCountValue);
            }
        }
    }

    /**
     * Removes multiple references for the Journal log files indicated in the given map.
     *
     * The references are used to track which log files cannot be GC'd. When the reference count
     * on a log file reaches zero the file id is removed from the tracker and the log will be
     * removed on the next check point update.
     *
     * @param tx
     *        The TX under which the update is to be performed.
     * @param decrementsByFileIds
     *        Map indicating how many decrements per fileId.
     *
     * @throws IOException if an error occurs while updating the journal references table.
     */
    protected void decrementJournalCount(Transaction tx, HashMap<Integer, Integer> decrementsByFileIds) throws IOException {
        for (Map.Entry<Integer, Integer> entry : decrementsByFileIds.entrySet()) {
            int logId = entry.getKey();
            Integer refCount = metaData.getJournalRC().get(tx, logId);

            if (refCount != null) {
                int refCountValue = refCount;
                refCountValue -= entry.getValue();
                if (refCountValue <= 0) {
                    metaData.getJournalRC().remove(tx, logId);
                } else {
                    metaData.getJournalRC().put(tx, logId, refCountValue);
                }
            }
        }
    }

    /**
     * Updates the Job removal tracking index with the location of a remove command and the
     * original JobLocation entry.
     *
     * The data file id of the removed job's location is appended to the list tracked for
     * the log file that contains the remove command. That log file can only be discarded
     * once the tracked data files have been discarded.
     *
     * @param tx
     *        The TX under which the update is to be performed.
     * @param location
     *        The location value to reference a remove command.
     * @param removedJob
     *        The original JobLocation instance of the job that was removed.
     *
     * @throws IOException if an error occurs while updating the remove location tracker.
     */
    protected void referenceRemovedLocation(Transaction tx, Location location, JobLocation removedJob) throws IOException {
        int logId = location.getDataFileId();
        List<Integer> removed = this.metaData.getRemoveLocationTracker().get(tx, logId);
        if (removed == null) {
            removed = new ArrayList<Integer>();
        }
        removed.add(removedJob.getLocation().getDataFileId());
        this.metaData.getRemoveLocationTracker().put(tx, logId, removed);
    }

    /**
     * Updates the Job removal tracking index with the location of a remove command and the
     * data file ids of the jobs it removed.
     *
     * The given data file ids are appended to the list tracked for the log file that
     * contains the remove command. That log file can only be discarded once the tracked
     * data files have been discarded.
     *
     * @param tx
     *        The TX under which the update is to be performed.
     * @param location
     *        The location value to reference a remove command.
     * @param removedJobsFileId
     *        The journal data file ids to track for the removed jobs.
     *
     * @throws IOException if an error occurs while updating the remove location tracker.
     */
    protected void referenceRemovedLocation(Transaction tx, Location location, List<Integer> removedJobsFileId) throws IOException {
        int logId = location.getDataFileId();
        List<Integer> removed = this.metaData.getRemoveLocationTracker().get(tx, logId);
        if (removed == null) {
            removed = new ArrayList<Integer>();
        }
        removed.addAll(removedJobsFileId);
        this.metaData.getRemoveLocationTracker().put(tx, logId, removed);
    }

    /**
     * Retrieve the scheduled Job's byte blob from the journal.
     *
     * @param location
     *        The location of the KahaAddScheduledJobCommand that originated the Job.
     *
     * @return a ByteSequence containing the payload of the scheduled Job.
     *
     * @throws IOException if an error occurs while reading the payload value.
     */
    protected ByteSequence getPayload(Location location) throws IOException {
        KahaAddScheduledJobCommand job = (KahaAddScheduledJobCommand) this.load(location);
        Buffer payload = job.getPayload();
        return new ByteSequence(payload.getData(), payload.getOffset(), payload.getLength());
    }

    /** Acquires the index read lock. */
    public void readLockIndex() {
        this.indexLock.readLock().lock();
    }

    /** Releases the index read lock. */
    public void readUnlockIndex() {
        this.indexLock.readLock().unlock();
    }

    /** Acquires the index write lock. */
    public void writeLockIndex() {
        this.indexLock.writeLock().lock();
    }

    /** Releases the index write lock. */
    public void writeUnlockIndex() {
        this.indexLock.writeLock().unlock();
    }

    @Override
    public String toString() {
        return "JobSchedulerStore: " + getDirectory();
    }

    @Override
    protected String getPageFileName() {
        return "scheduleDB";
    }

    @Override
    protected File getDefaultDataDirectory() {
        return new File(IOHelper.getDefaultDataDirectory(), "delayedDB");
    }

    /**
     * Marshaller used to read and write the store meta-data page. Instances read from
     * the page file are bound to the store given at construction time.
     */
    private static final class MetaDataMarshaller extends VariableMarshaller<JobSchedulerKahaDBMetaData> {

        private final JobSchedulerStoreImpl store;

        MetaDataMarshaller(JobSchedulerStoreImpl store) {
            this.store = store;
        }

        @Override
        public JobSchedulerKahaDBMetaData readPayload(DataInput dataIn) throws IOException {
            JobSchedulerKahaDBMetaData rc = new JobSchedulerKahaDBMetaData(store);
            rc.read(dataIn);
            return rc;
        }

        @Override
        public void writePayload(JobSchedulerKahaDBMetaData object, DataOutput dataOut) throws IOException {
            object.write(dataOut);
        }
    }

    /**
     * Called during index recovery to rebuild the index from the last known good location.
     * Entries that occur before the in-doubt location are ignored, as is every entry when
     * no in-doubt location is given.
     *
     * @param data
     *        the command read from the Journal which should be used to update the index.
     * @param location
     *        the location in the Journal where the command was read.
     * @param inDoubtlocation
     *        the location in the Journal from which the index is in doubt.
     *
     * @throws IOException if an error occurs while recovering the index.
     */
    protected void doRecover(JournalCommand<?> data, final Location location, final Location inDoubtlocation) throws IOException {
        if (inDoubtlocation != null && location.compareTo(inDoubtlocation) >= 0) {
            process(data, location);
        }
    }

    /**
     * Called during recovery to allow the store to rebuild from scratch.
     *
     * Scheduled job commands are applied to the scheduler they name under the index write
     * lock, and the location of every handled command is recorded as the last update
     * location.
     *
     * @param data
     *        The command to process, which was read from the Journal.
     * @param location
     *        The location of the command in the Journal.
     *
     * @throws IOException if an error occurs during command processing.
     */
    @Override
    protected void process(JournalCommand<?> data, final Location location) throws IOException {
        data.visit(new Visitor() {
            @Override
            public void visit(final KahaAddScheduledJobCommand command) throws IOException {
                final JobSchedulerImpl scheduler;

                indexLock.writeLock().lock();
                try {
                    try {
                        scheduler = (JobSchedulerImpl) getJobScheduler(command.getScheduler());
                    } catch (Exception e) {
                        throw new IOException(e);
                    }
                    getPageFile().tx().execute(new Transaction.Closure<IOException>() {
                        @Override
                        public void execute(Transaction tx) throws IOException {
                            scheduler.process(tx, command, location);
                        }
                    });

                    processLocation(location);
                } finally {
                    indexLock.writeLock().unlock();
                }
            }

            @Override
            public void visit(final KahaRemoveScheduledJobCommand command) throws IOException {
                final JobSchedulerImpl scheduler;

                indexLock.writeLock().lock();
                try {
                    try {
                        scheduler = (JobSchedulerImpl) getJobScheduler(command.getScheduler());
                    } catch (Exception e) {
                        throw new IOException(e);
                    }
                    getPageFile().tx().execute(new Transaction.Closure<IOException>() {
                        @Override
                        public void execute(Transaction tx) throws IOException {
                            scheduler.process(tx, command, location);
                        }
                    });

                    processLocation(location);
                } finally {
                    indexLock.writeLock().unlock();
                }
            }

            @Override
            public void visit(final KahaRemoveScheduledJobsCommand command) throws IOException {
                final JobSchedulerImpl scheduler;

                indexLock.writeLock().lock();
                try {
                    try {
                        scheduler = (JobSchedulerImpl) getJobScheduler(command.getScheduler());
                    } catch (Exception e) {
                        throw new IOException(e);
                    }
                    getPageFile().tx().execute(new Transaction.Closure<IOException>() {
                        @Override
                        public void execute(Transaction tx) throws IOException {
                            scheduler.process(tx, command, location);
                        }
                    });

                    processLocation(location);
                } finally {
                    indexLock.writeLock().unlock();
                }
            }

            @Override
            public void visit(final KahaRescheduleJobCommand command) throws IOException {
                final JobSchedulerImpl scheduler;

                indexLock.writeLock().lock();
                try {
                    try {
                        scheduler = (JobSchedulerImpl) getJobScheduler(command.getScheduler());
                    } catch (Exception e) {
                        throw new IOException(e);
                    }
                    getPageFile().tx().execute(new Transaction.Closure<IOException>() {
                        @Override
                        public void execute(Transaction tx) throws IOException {
                            scheduler.process(tx, command, location);
                        }
                    });

                    processLocation(location);
                } finally {
                    indexLock.writeLock().unlock();
                }
            }

            @Override
            public void visit(final KahaDestroySchedulerCommand command) {
                try {
                    removeJobScheduler(command.getScheduler());
                } catch (Exception e) {
                    // Processing continues, but keep the cause so the failure can be diagnosed.
                    LOG.warn("Failed to remove scheduler: {}", command.getScheduler(), e);
                }

                processLocation(location);
            }

            @Override
            public void visit(KahaTraceCommand command) {
                processLocation(location);
            }
        });
    }

    /**
     * Records the given location as the last update location in the store meta-data,
     * holding the index write lock while doing so.
     *
     * @param location
     *        The journal location of the command that was just processed.
     */
    protected void processLocation(final Location location) {
        indexLock.writeLock().lock();
        try {
            this.metaData.setLastUpdateLocation(location);
        } finally {
            indexLock.writeLock().unlock();
        }
    }

    /**
     * We recover from the Journal logs as needed to restore the index.
     *
     * Commands are replayed starting at the recovery position. A command that cannot be
     * read either fails the recovery or, when missing journal files are ignored, is marked
     * as a corrupt location and skipped. The index is then checked against the journal.
     *
     * @throws IllegalStateException
     * @throws IOException if a journal entry cannot be recovered or the index check fails.
     */
    private void recover() throws IllegalStateException, IOException {
        this.indexLock.writeLock().lock();
        try {
            long start = System.currentTimeMillis();
            Location lastIndoubtPosition = getRecoveryPosition();
            Location recoveryPosition = lastIndoubtPosition;

            if (recoveryPosition != null) {
                int redoCounter = 0;
                LOG.info("Recovering from the scheduled job journal @{}", recoveryPosition);
                while (recoveryPosition != null) {
                    try {
                        JournalCommand<?> message = load(recoveryPosition);
                        metaData.setLastUpdateLocation(recoveryPosition);
                        doRecover(message, recoveryPosition, lastIndoubtPosition);
                        redoCounter++;
                    } catch (IOException failedRecovery) {
                        if (isIgnoreMissingJournalfiles()) {
                            LOG.debug("Failed to recover data at position:{}", recoveryPosition, failedRecovery);
                            // track this dud location
                            journal.corruptRecoveryLocation(recoveryPosition);
                        } else {
                            throw new IOException("Failed to recover data at position:" + recoveryPosition, failedRecovery);
                        }
                    }
                    recoveryPosition = journal.getNextLocation(recoveryPosition);
                    if (LOG.isInfoEnabled() && redoCounter % 100000 == 0) {
                        LOG.info("@ {}, {} entries recovered ..", recoveryPosition, redoCounter);
                    }
                }
                long end = System.currentTimeMillis();
                LOG.info("Recovery replayed {} operations from the journal in {} seconds.",
                    redoCounter, ((end - start) / 1000.0f));
            }

            // We may have to undo some index updates.
            pageFile.tx().execute(new Transaction.Closure<IOException>() {
                @Override
                public void execute(Transaction tx) throws IOException {
                    recoverIndex(tx);
                }
            });

        } finally {
            this.indexLock.writeLock().unlock();
        }
    }

    /**
     * Determines the journal location at which recovery starts: the location following
     * the last update recorded in the meta-data or, when a full index recovery is forced
     * or no last update is recorded, whatever the journal returns for a null location.
     *
     * @return the location to start recovery from, as returned by the journal.
     *
     * @throws IOException if the journal cannot provide the next location.
     */
    private Location getRecoveryPosition() throws IOException {
        // This loads the first position and we completely rebuild the index if we
        // do not override it with some known recovery start location.
        Location result = null;

        if (!isForceRecoverIndex()) {
            if (metaData.getLastUpdateLocation() != null) {
                result = metaData.getLastUpdateLocation();
            }
        }

        return journal.getNextLocation(result);
    }

    /**
     * Brings the index back in line with the journal: jobs located at or past the last
     * appended journal location are removed, and references to missing or corrupt journal
     * files are either dropped or reported as an error depending on the store settings.
     *
     * @param tx
     *        The TX under which the index is checked and updated.
     *
     * @throws IOException if missing or corrupt journal files are detected and are not
     *         being ignored, or if the index update fails.
     */
    private void recoverIndex(Transaction tx) throws IOException {
        long start = System.currentTimeMillis();

        // It is possible index updates got applied before the journal updates..
        // in that case we need to remove references to Jobs that are not in the journal
        final Location lastAppendLocation = journal.getLastAppendLocation();
        long undoCounter = 0;

        // Go through all the jobs in each scheduler and check if any are added after
        // the last appended location and remove those. For now we ignore the update
        // location since the scheduled job will update itself after the next fire and
        // a new update will replace any existing update.
        for (Iterator<Map.Entry<String, JobSchedulerImpl>> i = metaData.getJobSchedulers().iterator(tx); i.hasNext();) {
            Map.Entry<String, JobSchedulerImpl> entry = i.next();
            JobSchedulerImpl scheduler = entry.getValue();

            for (Iterator<JobLocation> jobLocationIterator = scheduler.getAllScheduledJobs(tx); jobLocationIterator.hasNext();) {
                final JobLocation job = jobLocationIterator.next();
                if (job.getLocation().compareTo(lastAppendLocation) >= 0) {
                    if (scheduler.removeJobAtTime(tx, job.getJobId(), job.getNextTime())) {
                        LOG.trace("Removed Job past last appened in the journal: {}", job.getJobId());
                        undoCounter++;
                    }
                }
            }
        }

        if (undoCounter > 0) {
            // The rolled back operations are basically in flight journal writes. To avoid getting
            // these the end user should do sync writes to the journal.
            long end = System.currentTimeMillis();
            LOG.info("Rolled back {} messages from the index in {} seconds.", undoCounter, ((end - start) / 1000.0f));
            undoCounter = 0;
        }

        // Now we check for missing and corrupt journal files.

        // 1. Collect the set of all referenced journal files based on the Location of
        //    the scheduled jobs and the marked last update field.
        HashSet<Integer> missingJournalFiles = new HashSet<Integer>();
        for (Iterator<Map.Entry<String, JobSchedulerImpl>> i = metaData.getJobSchedulers().iterator(tx); i.hasNext();) {
            Map.Entry<String, JobSchedulerImpl> entry = i.next();
            JobSchedulerImpl scheduler = entry.getValue();

            for (Iterator<JobLocation> jobLocationIterator = scheduler.getAllScheduledJobs(tx); jobLocationIterator.hasNext();) {
                final JobLocation job = jobLocationIterator.next();
                missingJournalFiles.add(job.getLocation().getDataFileId());
                if (job.getLastUpdate() != null) {
                    missingJournalFiles.add(job.getLastUpdate().getDataFileId());
                }
            }
        }

        // 2. Remove from that set all known data file Id's in the journal and what's left
        //    is the missing set which will soon also contain the corrupted set.
        missingJournalFiles.removeAll(journal.getFileMap().keySet());
        if (!missingJournalFiles.isEmpty()) {
            LOG.info("Some journal files are missing: {}", missingJournalFiles);
        }

        // 3. Now check all references in the journal logs for corruption and collect the
        //    corrupted locations.
        HashSet<Location> corruptedLocations = new HashSet<Location>();

        if (isCheckForCorruptJournalFiles()) {
            Collection<DataFile> dataFiles = journal.getFileMap().values();
            for (DataFile dataFile : dataFiles) {
                int id = dataFile.getDataFileId();
                for (long offset : dataFile.getCorruptedBlocks()) {
                    corruptedLocations.add(new Location(id, (int) offset));
                }
            }

            if (!corruptedLocations.isEmpty()) {
                LOG.debug("Found some corrupted data blocks in the journal: {}", corruptedLocations.size());
            }
        }

        // 4. Now we either fail or we remove all references to missing or corrupt journal
        //    files from the various JobSchedulerImpl instances. We only remove the Job if
        //    the initial Add operation is missing when the ignore option is set, the updates
        //    could be lost but that's the price you pay when ignoring the missing logs.
        if (!missingJournalFiles.isEmpty() || !corruptedLocations.isEmpty()) {
            if (!isIgnoreMissingJournalfiles()) {
                throw new IOException("Detected missing/corrupt journal files.");
            }

            // Remove all Jobs that reference a Location that is either missing or corrupt.
            undoCounter = removeJobsInMissingOrCorruptJounralFiles(tx, missingJournalFiles, corruptedLocations);

            // Clean up the Journal Reference count Map.
            removeJournalRCForMissingFiles(tx, missingJournalFiles);
        }

        if (undoCounter > 0) {
            long end = System.currentTimeMillis();
            LOG.info("Detected missing/corrupt journal files. Dropped {} jobs from the " +
                "index in {} seconds.", undoCounter, ((end - start) / 1000.0f));
        }
    }

    /**
     * Removes the journal reference count entries of every data file in the missing set.
     *
     * @param tx
     *        The TX under which the update is to be performed.
     * @param missing
     *        The ids of the journal data files that are missing.
     *
     * @throws IOException if an error occurs while updating the journal references table.
     */
    private void removeJournalRCForMissingFiles(Transaction tx, Set<Integer> missing) throws IOException {
        List<Integer> matches = new ArrayList<Integer>();

        // Collect first and remove afterwards so the index is not modified while iterating.
        Iterator<Entry<Integer, Integer>> references = metaData.getJournalRC().iterator(tx);
        while (references.hasNext()) {
            int dataFileId = references.next().getKey();
            if (missing.contains(dataFileId)) {
                matches.add(dataFileId);
            }
        }

        for (Integer match : matches) {
            metaData.getJournalRC().remove(tx, match);
        }
    }

    /**
     * Removes every scheduled job whose add location lies in a missing journal data file
     * or at a corrupted journal location.
     *
     * @param tx
     *        The TX under which the update is to be performed.
     * @param missing
     *        The ids of the journal data files that are missing.
     * @param corrupted
     *        The journal locations that were found to be corrupted.
     *
     * @return the number of jobs for which a removal was requested.
     *
     * @throws IOException if an error occurs while removing a job from its scheduler.
     */
    private int removeJobsInMissingOrCorruptJounralFiles(Transaction tx, Set<Integer> missing, Set<Location> corrupted) throws IOException {
        int removed = 0;

        for (Iterator<Map.Entry<String, JobSchedulerImpl>> i = metaData.getJobSchedulers().iterator(tx); i.hasNext();) {
            Map.Entry<String, JobSchedulerImpl> entry = i.next();
            JobSchedulerImpl scheduler = entry.getValue();

            for (Iterator<JobLocation> jobLocationIterator = scheduler.getAllScheduledJobs(tx); jobLocationIterator.hasNext();) {
                final JobLocation job = jobLocationIterator.next();

                // Remove all jobs in missing log files.
                if (missing.contains(job.getLocation().getDataFileId())) {
                    scheduler.removeJobAtTime(tx, job.getJobId(), job.getNextTime());
                    removed++;
                    continue;
                }

                // Remove all jobs in corrupted parts of log files.
                if (corrupted.contains(job.getLocation())) {
                    scheduler.removeJobAtTime(tx, job.getJobId(), job.getNextTime());
                    removed++;
                }
            }
        }

        return removed;
    }
}